18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * linux/mm/mlock.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * (C) Copyright 1995 Linus Torvalds 68c2ecf20Sopenharmony_ci * (C) Copyright 2002 Christoph Hellwig 78c2ecf20Sopenharmony_ci */ 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci#include <linux/capability.h> 108c2ecf20Sopenharmony_ci#include <linux/mman.h> 118c2ecf20Sopenharmony_ci#include <linux/mm.h> 128c2ecf20Sopenharmony_ci#include <linux/sched/user.h> 138c2ecf20Sopenharmony_ci#include <linux/swap.h> 148c2ecf20Sopenharmony_ci#include <linux/swapops.h> 158c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 168c2ecf20Sopenharmony_ci#include <linux/pagevec.h> 178c2ecf20Sopenharmony_ci#include <linux/mempolicy.h> 188c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 198c2ecf20Sopenharmony_ci#include <linux/sched.h> 208c2ecf20Sopenharmony_ci#include <linux/export.h> 218c2ecf20Sopenharmony_ci#include <linux/rmap.h> 228c2ecf20Sopenharmony_ci#include <linux/mmzone.h> 238c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 248c2ecf20Sopenharmony_ci#include <linux/memcontrol.h> 258c2ecf20Sopenharmony_ci#include <linux/mm_inline.h> 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci#include "internal.h" 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_cibool can_do_mlock(void) 308c2ecf20Sopenharmony_ci{ 318c2ecf20Sopenharmony_ci if (rlimit(RLIMIT_MEMLOCK) != 0) 328c2ecf20Sopenharmony_ci return true; 338c2ecf20Sopenharmony_ci if (capable(CAP_IPC_LOCK)) 348c2ecf20Sopenharmony_ci return true; 358c2ecf20Sopenharmony_ci return false; 368c2ecf20Sopenharmony_ci} 378c2ecf20Sopenharmony_ciEXPORT_SYMBOL(can_do_mlock); 388c2ecf20Sopenharmony_ci 398c2ecf20Sopenharmony_ci/* 408c2ecf20Sopenharmony_ci * Mlocked pages are marked with PageMlocked() flag for efficient testing 418c2ecf20Sopenharmony_ci * in vmscan and, possibly, the fault path; and to support semi-accurate 428c2ecf20Sopenharmony_ci * statistics. 438c2ecf20Sopenharmony_ci * 448c2ecf20Sopenharmony_ci * An mlocked page [PageMlocked(page)] is unevictable. As such, it will 458c2ecf20Sopenharmony_ci * be placed on the LRU "unevictable" list, rather than the [in]active lists. 468c2ecf20Sopenharmony_ci * The unevictable list is an LRU sibling list to the [in]active lists. 478c2ecf20Sopenharmony_ci * PageUnevictable is set to indicate the unevictable state. 488c2ecf20Sopenharmony_ci * 498c2ecf20Sopenharmony_ci * When lazy mlocking via vmscan, it is important to ensure that the 508c2ecf20Sopenharmony_ci * vma's VM_LOCKED status is not concurrently being modified, otherwise we 518c2ecf20Sopenharmony_ci * may have mlocked a page that is being munlocked. So lazy mlock must take 528c2ecf20Sopenharmony_ci * the mmap_lock for read, and verify that the vma really is locked 538c2ecf20Sopenharmony_ci * (see mm/rmap.c). 548c2ecf20Sopenharmony_ci */ 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci/* 578c2ecf20Sopenharmony_ci * LRU accounting for clear_page_mlock() 588c2ecf20Sopenharmony_ci */ 598c2ecf20Sopenharmony_civoid clear_page_mlock(struct page *page) 608c2ecf20Sopenharmony_ci{ 618c2ecf20Sopenharmony_ci int nr_pages; 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci if (!TestClearPageMlocked(page)) 648c2ecf20Sopenharmony_ci return; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci nr_pages = thp_nr_pages(page); 678c2ecf20Sopenharmony_ci mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 688c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); 698c2ecf20Sopenharmony_ci /* 708c2ecf20Sopenharmony_ci * The previous TestClearPageMlocked() corresponds to the smp_mb() 718c2ecf20Sopenharmony_ci * in __pagevec_lru_add_fn(). 728c2ecf20Sopenharmony_ci * 738c2ecf20Sopenharmony_ci * See __pagevec_lru_add_fn for more explanation. 748c2ecf20Sopenharmony_ci */ 758c2ecf20Sopenharmony_ci if (!isolate_lru_page(page)) { 768c2ecf20Sopenharmony_ci putback_lru_page(page); 778c2ecf20Sopenharmony_ci } else { 788c2ecf20Sopenharmony_ci /* 798c2ecf20Sopenharmony_ci * We lost the race. the page already moved to evictable list. 808c2ecf20Sopenharmony_ci */ 818c2ecf20Sopenharmony_ci if (PageUnevictable(page)) 828c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); 838c2ecf20Sopenharmony_ci } 848c2ecf20Sopenharmony_ci} 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci/* 878c2ecf20Sopenharmony_ci * Mark page as mlocked if not already. 888c2ecf20Sopenharmony_ci * If page on LRU, isolate and putback to move to unevictable list. 898c2ecf20Sopenharmony_ci */ 908c2ecf20Sopenharmony_civoid mlock_vma_page(struct page *page) 918c2ecf20Sopenharmony_ci{ 928c2ecf20Sopenharmony_ci /* Serialize with page migration */ 938c2ecf20Sopenharmony_ci BUG_ON(!PageLocked(page)); 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageTail(page), page); 968c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci if (!TestSetPageMlocked(page)) { 998c2ecf20Sopenharmony_ci int nr_pages = thp_nr_pages(page); 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); 1028c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); 1038c2ecf20Sopenharmony_ci if (!isolate_lru_page(page)) 1048c2ecf20Sopenharmony_ci putback_lru_page(page); 1058c2ecf20Sopenharmony_ci } 1068c2ecf20Sopenharmony_ci} 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci/* 1098c2ecf20Sopenharmony_ci * Isolate a page from LRU with optional get_page() pin. 1108c2ecf20Sopenharmony_ci * Assumes lru_lock already held and page already pinned. 1118c2ecf20Sopenharmony_ci */ 1128c2ecf20Sopenharmony_cistatic bool __munlock_isolate_lru_page(struct page *page, bool getpage) 1138c2ecf20Sopenharmony_ci{ 1148c2ecf20Sopenharmony_ci if (PageLRU(page)) { 1158c2ecf20Sopenharmony_ci struct lruvec *lruvec; 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); 1188c2ecf20Sopenharmony_ci if (getpage) 1198c2ecf20Sopenharmony_ci get_page(page); 1208c2ecf20Sopenharmony_ci ClearPageLRU(page); 1218c2ecf20Sopenharmony_ci del_page_from_lru_list(page, lruvec, page_lru(page)); 1228c2ecf20Sopenharmony_ci return true; 1238c2ecf20Sopenharmony_ci } 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_ci return false; 1268c2ecf20Sopenharmony_ci} 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_ci/* 1298c2ecf20Sopenharmony_ci * Finish munlock after successful page isolation 1308c2ecf20Sopenharmony_ci * 1318c2ecf20Sopenharmony_ci * Page must be locked. This is a wrapper for try_to_munlock() 1328c2ecf20Sopenharmony_ci * and putback_lru_page() with munlock accounting. 1338c2ecf20Sopenharmony_ci */ 1348c2ecf20Sopenharmony_cistatic void __munlock_isolated_page(struct page *page) 1358c2ecf20Sopenharmony_ci{ 1368c2ecf20Sopenharmony_ci /* 1378c2ecf20Sopenharmony_ci * Optimization: if the page was mapped just once, that's our mapping 1388c2ecf20Sopenharmony_ci * and we don't need to check all the other vmas. 1398c2ecf20Sopenharmony_ci */ 1408c2ecf20Sopenharmony_ci if (page_mapcount(page) > 1) 1418c2ecf20Sopenharmony_ci try_to_munlock(page); 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci /* Did try_to_unlock() succeed or punt? */ 1448c2ecf20Sopenharmony_ci if (!PageMlocked(page)) 1458c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci putback_lru_page(page); 1488c2ecf20Sopenharmony_ci} 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci/* 1518c2ecf20Sopenharmony_ci * Accounting for page isolation fail during munlock 1528c2ecf20Sopenharmony_ci * 1538c2ecf20Sopenharmony_ci * Performs accounting when page isolation fails in munlock. There is nothing 1548c2ecf20Sopenharmony_ci * else to do because it means some other task has already removed the page 1558c2ecf20Sopenharmony_ci * from the LRU. putback_lru_page() will take care of removing the page from 1568c2ecf20Sopenharmony_ci * the unevictable list, if necessary. vmscan [page_referenced()] will move 1578c2ecf20Sopenharmony_ci * the page back to the unevictable list if some other vma has it mlocked. 1588c2ecf20Sopenharmony_ci */ 1598c2ecf20Sopenharmony_cistatic void __munlock_isolation_failed(struct page *page) 1608c2ecf20Sopenharmony_ci{ 1618c2ecf20Sopenharmony_ci int nr_pages = thp_nr_pages(page); 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_ci if (PageUnevictable(page)) 1648c2ecf20Sopenharmony_ci __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); 1658c2ecf20Sopenharmony_ci else 1668c2ecf20Sopenharmony_ci __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_ci/** 1708c2ecf20Sopenharmony_ci * munlock_vma_page - munlock a vma page 1718c2ecf20Sopenharmony_ci * @page: page to be unlocked, either a normal page or THP page head 1728c2ecf20Sopenharmony_ci * 1738c2ecf20Sopenharmony_ci * returns the size of the page as a page mask (0 for normal page, 1748c2ecf20Sopenharmony_ci * HPAGE_PMD_NR - 1 for THP head page) 1758c2ecf20Sopenharmony_ci * 1768c2ecf20Sopenharmony_ci * called from munlock()/munmap() path with page supposedly on the LRU. 1778c2ecf20Sopenharmony_ci * When we munlock a page, because the vma where we found the page is being 1788c2ecf20Sopenharmony_ci * munlock()ed or munmap()ed, we want to check whether other vmas hold the 1798c2ecf20Sopenharmony_ci * page locked so that we can leave it on the unevictable lru list and not 1808c2ecf20Sopenharmony_ci * bother vmscan with it. However, to walk the page's rmap list in 1818c2ecf20Sopenharmony_ci * try_to_munlock() we must isolate the page from the LRU. If some other 1828c2ecf20Sopenharmony_ci * task has removed the page from the LRU, we won't be able to do that. 1838c2ecf20Sopenharmony_ci * So we clear the PageMlocked as we might not get another chance. If we 1848c2ecf20Sopenharmony_ci * can't isolate the page, we leave it for putback_lru_page() and vmscan 1858c2ecf20Sopenharmony_ci * [page_referenced()/try_to_unmap()] to deal with. 1868c2ecf20Sopenharmony_ci */ 1878c2ecf20Sopenharmony_ciunsigned int munlock_vma_page(struct page *page) 1888c2ecf20Sopenharmony_ci{ 1898c2ecf20Sopenharmony_ci int nr_pages; 1908c2ecf20Sopenharmony_ci pg_data_t *pgdat = page_pgdat(page); 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_ci /* For try_to_munlock() and to serialize with page migration */ 1938c2ecf20Sopenharmony_ci BUG_ON(!PageLocked(page)); 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageTail(page), page); 1968c2ecf20Sopenharmony_ci 1978c2ecf20Sopenharmony_ci /* 1988c2ecf20Sopenharmony_ci * Serialize with any parallel __split_huge_page_refcount() which 1998c2ecf20Sopenharmony_ci * might otherwise copy PageMlocked to part of the tail pages before 2008c2ecf20Sopenharmony_ci * we clear it in the head page. It also stabilizes thp_nr_pages(). 2018c2ecf20Sopenharmony_ci */ 2028c2ecf20Sopenharmony_ci spin_lock_irq(&pgdat->lru_lock); 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci if (!TestClearPageMlocked(page)) { 2058c2ecf20Sopenharmony_ci /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ 2068c2ecf20Sopenharmony_ci nr_pages = 1; 2078c2ecf20Sopenharmony_ci goto unlock_out; 2088c2ecf20Sopenharmony_ci } 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci nr_pages = thp_nr_pages(page); 2118c2ecf20Sopenharmony_ci __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci if (__munlock_isolate_lru_page(page, true)) { 2148c2ecf20Sopenharmony_ci spin_unlock_irq(&pgdat->lru_lock); 2158c2ecf20Sopenharmony_ci __munlock_isolated_page(page); 2168c2ecf20Sopenharmony_ci goto out; 2178c2ecf20Sopenharmony_ci } 2188c2ecf20Sopenharmony_ci __munlock_isolation_failed(page); 2198c2ecf20Sopenharmony_ci 2208c2ecf20Sopenharmony_ciunlock_out: 2218c2ecf20Sopenharmony_ci spin_unlock_irq(&pgdat->lru_lock); 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_ciout: 2248c2ecf20Sopenharmony_ci return nr_pages - 1; 2258c2ecf20Sopenharmony_ci} 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_ci/* 2288c2ecf20Sopenharmony_ci * convert get_user_pages() return value to posix mlock() error 2298c2ecf20Sopenharmony_ci */ 2308c2ecf20Sopenharmony_cistatic int __mlock_posix_error_return(long retval) 2318c2ecf20Sopenharmony_ci{ 2328c2ecf20Sopenharmony_ci if (retval == -EFAULT) 2338c2ecf20Sopenharmony_ci retval = -ENOMEM; 2348c2ecf20Sopenharmony_ci else if (retval == -ENOMEM) 2358c2ecf20Sopenharmony_ci retval = -EAGAIN; 2368c2ecf20Sopenharmony_ci return retval; 2378c2ecf20Sopenharmony_ci} 2388c2ecf20Sopenharmony_ci 2398c2ecf20Sopenharmony_ci/* 2408c2ecf20Sopenharmony_ci * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() 2418c2ecf20Sopenharmony_ci * 2428c2ecf20Sopenharmony_ci * The fast path is available only for evictable pages with single mapping. 2438c2ecf20Sopenharmony_ci * Then we can bypass the per-cpu pvec and get better performance. 2448c2ecf20Sopenharmony_ci * when mapcount > 1 we need try_to_munlock() which can fail. 2458c2ecf20Sopenharmony_ci * when !page_evictable(), we need the full redo logic of putback_lru_page to 2468c2ecf20Sopenharmony_ci * avoid leaving evictable page in unevictable list. 2478c2ecf20Sopenharmony_ci * 2488c2ecf20Sopenharmony_ci * In case of success, @page is added to @pvec and @pgrescued is incremented 2498c2ecf20Sopenharmony_ci * in case that the page was previously unevictable. @page is also unlocked. 2508c2ecf20Sopenharmony_ci */ 2518c2ecf20Sopenharmony_cistatic bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, 2528c2ecf20Sopenharmony_ci int *pgrescued) 2538c2ecf20Sopenharmony_ci{ 2548c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageLRU(page), page); 2558c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_ci if (page_mapcount(page) <= 1 && page_evictable(page)) { 2588c2ecf20Sopenharmony_ci pagevec_add(pvec, page); 2598c2ecf20Sopenharmony_ci if (TestClearPageUnevictable(page)) 2608c2ecf20Sopenharmony_ci (*pgrescued)++; 2618c2ecf20Sopenharmony_ci unlock_page(page); 2628c2ecf20Sopenharmony_ci return true; 2638c2ecf20Sopenharmony_ci } 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci return false; 2668c2ecf20Sopenharmony_ci} 2678c2ecf20Sopenharmony_ci 2688c2ecf20Sopenharmony_ci/* 2698c2ecf20Sopenharmony_ci * Putback multiple evictable pages to the LRU 2708c2ecf20Sopenharmony_ci * 2718c2ecf20Sopenharmony_ci * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of 2728c2ecf20Sopenharmony_ci * the pages might have meanwhile become unevictable but that is OK. 2738c2ecf20Sopenharmony_ci */ 2748c2ecf20Sopenharmony_cistatic void __putback_lru_fast(struct pagevec *pvec, int pgrescued) 2758c2ecf20Sopenharmony_ci{ 2768c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); 2778c2ecf20Sopenharmony_ci /* 2788c2ecf20Sopenharmony_ci *__pagevec_lru_add() calls release_pages() so we don't call 2798c2ecf20Sopenharmony_ci * put_page() explicitly 2808c2ecf20Sopenharmony_ci */ 2818c2ecf20Sopenharmony_ci __pagevec_lru_add(pvec); 2828c2ecf20Sopenharmony_ci count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 2838c2ecf20Sopenharmony_ci} 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci/* 2868c2ecf20Sopenharmony_ci * Munlock a batch of pages from the same zone 2878c2ecf20Sopenharmony_ci * 2888c2ecf20Sopenharmony_ci * The work is split to two main phases. First phase clears the Mlocked flag 2898c2ecf20Sopenharmony_ci * and attempts to isolate the pages, all under a single zone lru lock. 2908c2ecf20Sopenharmony_ci * The second phase finishes the munlock only for pages where isolation 2918c2ecf20Sopenharmony_ci * succeeded. 2928c2ecf20Sopenharmony_ci * 2938c2ecf20Sopenharmony_ci * Note that the pagevec may be modified during the process. 2948c2ecf20Sopenharmony_ci */ 2958c2ecf20Sopenharmony_cistatic void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) 2968c2ecf20Sopenharmony_ci{ 2978c2ecf20Sopenharmony_ci int i; 2988c2ecf20Sopenharmony_ci int nr = pagevec_count(pvec); 2998c2ecf20Sopenharmony_ci int delta_munlocked = -nr; 3008c2ecf20Sopenharmony_ci struct pagevec pvec_putback; 3018c2ecf20Sopenharmony_ci int pgrescued = 0; 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci pagevec_init(&pvec_putback); 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_ci /* Phase 1: page isolation */ 3068c2ecf20Sopenharmony_ci spin_lock_irq(&zone->zone_pgdat->lru_lock); 3078c2ecf20Sopenharmony_ci for (i = 0; i < nr; i++) { 3088c2ecf20Sopenharmony_ci struct page *page = pvec->pages[i]; 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci if (TestClearPageMlocked(page)) { 3118c2ecf20Sopenharmony_ci /* 3128c2ecf20Sopenharmony_ci * We already have pin from follow_page_mask() 3138c2ecf20Sopenharmony_ci * so we can spare the get_page() here. 3148c2ecf20Sopenharmony_ci */ 3158c2ecf20Sopenharmony_ci if (__munlock_isolate_lru_page(page, false)) 3168c2ecf20Sopenharmony_ci continue; 3178c2ecf20Sopenharmony_ci else 3188c2ecf20Sopenharmony_ci __munlock_isolation_failed(page); 3198c2ecf20Sopenharmony_ci } else { 3208c2ecf20Sopenharmony_ci delta_munlocked++; 3218c2ecf20Sopenharmony_ci } 3228c2ecf20Sopenharmony_ci 3238c2ecf20Sopenharmony_ci /* 3248c2ecf20Sopenharmony_ci * We won't be munlocking this page in the next phase 3258c2ecf20Sopenharmony_ci * but we still need to release the follow_page_mask() 3268c2ecf20Sopenharmony_ci * pin. We cannot do it under lru_lock however. If it's 3278c2ecf20Sopenharmony_ci * the last pin, __page_cache_release() would deadlock. 3288c2ecf20Sopenharmony_ci */ 3298c2ecf20Sopenharmony_ci pagevec_add(&pvec_putback, pvec->pages[i]); 3308c2ecf20Sopenharmony_ci pvec->pages[i] = NULL; 3318c2ecf20Sopenharmony_ci } 3328c2ecf20Sopenharmony_ci __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 3338c2ecf20Sopenharmony_ci spin_unlock_irq(&zone->zone_pgdat->lru_lock); 3348c2ecf20Sopenharmony_ci 3358c2ecf20Sopenharmony_ci /* Now we can release pins of pages that we are not munlocking */ 3368c2ecf20Sopenharmony_ci pagevec_release(&pvec_putback); 3378c2ecf20Sopenharmony_ci 3388c2ecf20Sopenharmony_ci /* Phase 2: page munlock */ 3398c2ecf20Sopenharmony_ci for (i = 0; i < nr; i++) { 3408c2ecf20Sopenharmony_ci struct page *page = pvec->pages[i]; 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci if (page) { 3438c2ecf20Sopenharmony_ci lock_page(page); 3448c2ecf20Sopenharmony_ci if (!__putback_lru_fast_prepare(page, &pvec_putback, 3458c2ecf20Sopenharmony_ci &pgrescued)) { 3468c2ecf20Sopenharmony_ci /* 3478c2ecf20Sopenharmony_ci * Slow path. We don't want to lose the last 3488c2ecf20Sopenharmony_ci * pin before unlock_page() 3498c2ecf20Sopenharmony_ci */ 3508c2ecf20Sopenharmony_ci get_page(page); /* for putback_lru_page() */ 3518c2ecf20Sopenharmony_ci __munlock_isolated_page(page); 3528c2ecf20Sopenharmony_ci unlock_page(page); 3538c2ecf20Sopenharmony_ci put_page(page); /* from follow_page_mask() */ 3548c2ecf20Sopenharmony_ci } 3558c2ecf20Sopenharmony_ci } 3568c2ecf20Sopenharmony_ci } 3578c2ecf20Sopenharmony_ci 3588c2ecf20Sopenharmony_ci /* 3598c2ecf20Sopenharmony_ci * Phase 3: page putback for pages that qualified for the fast path 3608c2ecf20Sopenharmony_ci * This will also call put_page() to return pin from follow_page_mask() 3618c2ecf20Sopenharmony_ci */ 3628c2ecf20Sopenharmony_ci if (pagevec_count(&pvec_putback)) 3638c2ecf20Sopenharmony_ci __putback_lru_fast(&pvec_putback, pgrescued); 3648c2ecf20Sopenharmony_ci} 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci/* 3678c2ecf20Sopenharmony_ci * Fill up pagevec for __munlock_pagevec using pte walk 3688c2ecf20Sopenharmony_ci * 3698c2ecf20Sopenharmony_ci * The function expects that the struct page corresponding to @start address is 3708c2ecf20Sopenharmony_ci * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. 3718c2ecf20Sopenharmony_ci * 3728c2ecf20Sopenharmony_ci * The rest of @pvec is filled by subsequent pages within the same pmd and same 3738c2ecf20Sopenharmony_ci * zone, as long as the pte's are present and vm_normal_page() succeeds. These 3748c2ecf20Sopenharmony_ci * pages also get pinned. 3758c2ecf20Sopenharmony_ci * 3768c2ecf20Sopenharmony_ci * Returns the address of the next page that should be scanned. This equals 3778c2ecf20Sopenharmony_ci * @start + PAGE_SIZE when no page could be added by the pte walk. 3788c2ecf20Sopenharmony_ci */ 3798c2ecf20Sopenharmony_cistatic unsigned long __munlock_pagevec_fill(struct pagevec *pvec, 3808c2ecf20Sopenharmony_ci struct vm_area_struct *vma, struct zone *zone, 3818c2ecf20Sopenharmony_ci unsigned long start, unsigned long end) 3828c2ecf20Sopenharmony_ci{ 3838c2ecf20Sopenharmony_ci pte_t *pte; 3848c2ecf20Sopenharmony_ci spinlock_t *ptl; 3858c2ecf20Sopenharmony_ci 3868c2ecf20Sopenharmony_ci /* 3878c2ecf20Sopenharmony_ci * Initialize pte walk starting at the already pinned page where we 3888c2ecf20Sopenharmony_ci * are sure that there is a pte, as it was pinned under the same 3898c2ecf20Sopenharmony_ci * mmap_lock write op. 3908c2ecf20Sopenharmony_ci */ 3918c2ecf20Sopenharmony_ci pte = get_locked_pte(vma->vm_mm, start, &ptl); 3928c2ecf20Sopenharmony_ci /* Make sure we do not cross the page table boundary */ 3938c2ecf20Sopenharmony_ci end = pgd_addr_end(start, end); 3948c2ecf20Sopenharmony_ci end = p4d_addr_end(start, end); 3958c2ecf20Sopenharmony_ci end = pud_addr_end(start, end); 3968c2ecf20Sopenharmony_ci end = pmd_addr_end(start, end); 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci /* The page next to the pinned page is the first we will try to get */ 3998c2ecf20Sopenharmony_ci start += PAGE_SIZE; 4008c2ecf20Sopenharmony_ci while (start < end) { 4018c2ecf20Sopenharmony_ci struct page *page = NULL; 4028c2ecf20Sopenharmony_ci pte++; 4038c2ecf20Sopenharmony_ci if (pte_present(*pte)) 4048c2ecf20Sopenharmony_ci page = vm_normal_page(vma, start, *pte); 4058c2ecf20Sopenharmony_ci /* 4068c2ecf20Sopenharmony_ci * Break if page could not be obtained or the page's node+zone does not 4078c2ecf20Sopenharmony_ci * match 4088c2ecf20Sopenharmony_ci */ 4098c2ecf20Sopenharmony_ci if (!page || page_zone(page) != zone) 4108c2ecf20Sopenharmony_ci break; 4118c2ecf20Sopenharmony_ci 4128c2ecf20Sopenharmony_ci /* 4138c2ecf20Sopenharmony_ci * Do not use pagevec for PTE-mapped THP, 4148c2ecf20Sopenharmony_ci * munlock_vma_pages_range() will handle them. 4158c2ecf20Sopenharmony_ci */ 4168c2ecf20Sopenharmony_ci if (PageTransCompound(page)) 4178c2ecf20Sopenharmony_ci break; 4188c2ecf20Sopenharmony_ci 4198c2ecf20Sopenharmony_ci get_page(page); 4208c2ecf20Sopenharmony_ci /* 4218c2ecf20Sopenharmony_ci * Increase the address that will be returned *before* the 4228c2ecf20Sopenharmony_ci * eventual break due to pvec becoming full by adding the page 4238c2ecf20Sopenharmony_ci */ 4248c2ecf20Sopenharmony_ci start += PAGE_SIZE; 4258c2ecf20Sopenharmony_ci if (pagevec_add(pvec, page) == 0) 4268c2ecf20Sopenharmony_ci break; 4278c2ecf20Sopenharmony_ci } 4288c2ecf20Sopenharmony_ci pte_unmap_unlock(pte, ptl); 4298c2ecf20Sopenharmony_ci return start; 4308c2ecf20Sopenharmony_ci} 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci/* 4338c2ecf20Sopenharmony_ci * munlock_vma_pages_range() - munlock all pages in the vma range.' 4348c2ecf20Sopenharmony_ci * @vma - vma containing range to be munlock()ed. 4358c2ecf20Sopenharmony_ci * @start - start address in @vma of the range 4368c2ecf20Sopenharmony_ci * @end - end of range in @vma. 4378c2ecf20Sopenharmony_ci * 4388c2ecf20Sopenharmony_ci * For mremap(), munmap() and exit(). 4398c2ecf20Sopenharmony_ci * 4408c2ecf20Sopenharmony_ci * Called with @vma VM_LOCKED. 4418c2ecf20Sopenharmony_ci * 4428c2ecf20Sopenharmony_ci * Returns with VM_LOCKED cleared. Callers must be prepared to 4438c2ecf20Sopenharmony_ci * deal with this. 4448c2ecf20Sopenharmony_ci * 4458c2ecf20Sopenharmony_ci * We don't save and restore VM_LOCKED here because pages are 4468c2ecf20Sopenharmony_ci * still on lru. In unmap path, pages might be scanned by reclaim 4478c2ecf20Sopenharmony_ci * and re-mlocked by try_to_{munlock|unmap} before we unmap and 4488c2ecf20Sopenharmony_ci * free them. This will result in freeing mlocked pages. 4498c2ecf20Sopenharmony_ci */ 4508c2ecf20Sopenharmony_civoid munlock_vma_pages_range(struct vm_area_struct *vma, 4518c2ecf20Sopenharmony_ci unsigned long start, unsigned long end) 4528c2ecf20Sopenharmony_ci{ 4538c2ecf20Sopenharmony_ci vma->vm_flags &= VM_LOCKED_CLEAR_MASK; 4548c2ecf20Sopenharmony_ci 4558c2ecf20Sopenharmony_ci while (start < end) { 4568c2ecf20Sopenharmony_ci struct page *page; 4578c2ecf20Sopenharmony_ci unsigned int page_mask = 0; 4588c2ecf20Sopenharmony_ci unsigned long page_increm; 4598c2ecf20Sopenharmony_ci struct pagevec pvec; 4608c2ecf20Sopenharmony_ci struct zone *zone; 4618c2ecf20Sopenharmony_ci 4628c2ecf20Sopenharmony_ci pagevec_init(&pvec); 4638c2ecf20Sopenharmony_ci /* 4648c2ecf20Sopenharmony_ci * Although FOLL_DUMP is intended for get_dump_page(), 4658c2ecf20Sopenharmony_ci * it just so happens that its special treatment of the 4668c2ecf20Sopenharmony_ci * ZERO_PAGE (returning an error instead of doing get_page) 4678c2ecf20Sopenharmony_ci * suits munlock very well (and if somehow an abnormal page 4688c2ecf20Sopenharmony_ci * has sneaked into the range, we won't oops here: great). 4698c2ecf20Sopenharmony_ci */ 4708c2ecf20Sopenharmony_ci page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ci if (page && !IS_ERR(page)) { 4738c2ecf20Sopenharmony_ci if (PageTransTail(page)) { 4748c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageMlocked(page), page); 4758c2ecf20Sopenharmony_ci put_page(page); /* follow_page_mask() */ 4768c2ecf20Sopenharmony_ci } else if (PageTransHuge(page)) { 4778c2ecf20Sopenharmony_ci lock_page(page); 4788c2ecf20Sopenharmony_ci /* 4798c2ecf20Sopenharmony_ci * Any THP page found by follow_page_mask() may 4808c2ecf20Sopenharmony_ci * have gotten split before reaching 4818c2ecf20Sopenharmony_ci * munlock_vma_page(), so we need to compute 4828c2ecf20Sopenharmony_ci * the page_mask here instead. 4838c2ecf20Sopenharmony_ci */ 4848c2ecf20Sopenharmony_ci page_mask = munlock_vma_page(page); 4858c2ecf20Sopenharmony_ci unlock_page(page); 4868c2ecf20Sopenharmony_ci put_page(page); /* follow_page_mask() */ 4878c2ecf20Sopenharmony_ci } else { 4888c2ecf20Sopenharmony_ci /* 4898c2ecf20Sopenharmony_ci * Non-huge pages are handled in batches via 4908c2ecf20Sopenharmony_ci * pagevec. The pin from follow_page_mask() 4918c2ecf20Sopenharmony_ci * prevents them from collapsing by THP. 4928c2ecf20Sopenharmony_ci */ 4938c2ecf20Sopenharmony_ci pagevec_add(&pvec, page); 4948c2ecf20Sopenharmony_ci zone = page_zone(page); 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci /* 4978c2ecf20Sopenharmony_ci * Try to fill the rest of pagevec using fast 4988c2ecf20Sopenharmony_ci * pte walk. This will also update start to 4998c2ecf20Sopenharmony_ci * the next page to process. Then munlock the 5008c2ecf20Sopenharmony_ci * pagevec. 5018c2ecf20Sopenharmony_ci */ 5028c2ecf20Sopenharmony_ci start = __munlock_pagevec_fill(&pvec, vma, 5038c2ecf20Sopenharmony_ci zone, start, end); 5048c2ecf20Sopenharmony_ci __munlock_pagevec(&pvec, zone); 5058c2ecf20Sopenharmony_ci goto next; 5068c2ecf20Sopenharmony_ci } 5078c2ecf20Sopenharmony_ci } 5088c2ecf20Sopenharmony_ci page_increm = 1 + page_mask; 5098c2ecf20Sopenharmony_ci start += page_increm * PAGE_SIZE; 5108c2ecf20Sopenharmony_cinext: 5118c2ecf20Sopenharmony_ci cond_resched(); 5128c2ecf20Sopenharmony_ci } 5138c2ecf20Sopenharmony_ci} 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci/* 5168c2ecf20Sopenharmony_ci * mlock_fixup - handle mlock[all]/munlock[all] requests. 5178c2ecf20Sopenharmony_ci * 5188c2ecf20Sopenharmony_ci * Filters out "special" vmas -- VM_LOCKED never gets set for these, and 5198c2ecf20Sopenharmony_ci * munlock is a no-op. However, for some special vmas, we go ahead and 5208c2ecf20Sopenharmony_ci * populate the ptes. 5218c2ecf20Sopenharmony_ci * 5228c2ecf20Sopenharmony_ci * For vmas that pass the filters, merge/split as appropriate. 5238c2ecf20Sopenharmony_ci */ 5248c2ecf20Sopenharmony_cistatic int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 5258c2ecf20Sopenharmony_ci unsigned long start, unsigned long end, vm_flags_t newflags) 5268c2ecf20Sopenharmony_ci{ 5278c2ecf20Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 5288c2ecf20Sopenharmony_ci pgoff_t pgoff; 5298c2ecf20Sopenharmony_ci int nr_pages; 5308c2ecf20Sopenharmony_ci int ret = 0; 5318c2ecf20Sopenharmony_ci int lock = !!(newflags & VM_LOCKED); 5328c2ecf20Sopenharmony_ci vm_flags_t old_flags = vma->vm_flags; 5338c2ecf20Sopenharmony_ci 5348c2ecf20Sopenharmony_ci if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 5358c2ecf20Sopenharmony_ci is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || 5368c2ecf20Sopenharmony_ci vma_is_dax(vma)) 5378c2ecf20Sopenharmony_ci /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ 5388c2ecf20Sopenharmony_ci goto out; 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 5418c2ecf20Sopenharmony_ci *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 5428c2ecf20Sopenharmony_ci vma->vm_file, pgoff, vma_policy(vma), 5438c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 5448c2ecf20Sopenharmony_ci if (*prev) { 5458c2ecf20Sopenharmony_ci vma = *prev; 5468c2ecf20Sopenharmony_ci goto success; 5478c2ecf20Sopenharmony_ci } 5488c2ecf20Sopenharmony_ci 5498c2ecf20Sopenharmony_ci if (start != vma->vm_start) { 5508c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, start, 1); 5518c2ecf20Sopenharmony_ci if (ret) 5528c2ecf20Sopenharmony_ci goto out; 5538c2ecf20Sopenharmony_ci } 5548c2ecf20Sopenharmony_ci 5558c2ecf20Sopenharmony_ci if (end != vma->vm_end) { 5568c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, end, 0); 5578c2ecf20Sopenharmony_ci if (ret) 5588c2ecf20Sopenharmony_ci goto out; 5598c2ecf20Sopenharmony_ci } 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_cisuccess: 5628c2ecf20Sopenharmony_ci /* 5638c2ecf20Sopenharmony_ci * Keep track of amount of locked VM. 5648c2ecf20Sopenharmony_ci */ 5658c2ecf20Sopenharmony_ci nr_pages = (end - start) >> PAGE_SHIFT; 5668c2ecf20Sopenharmony_ci if (!lock) 5678c2ecf20Sopenharmony_ci nr_pages = -nr_pages; 5688c2ecf20Sopenharmony_ci else if (old_flags & VM_LOCKED) 5698c2ecf20Sopenharmony_ci nr_pages = 0; 5708c2ecf20Sopenharmony_ci mm->locked_vm += nr_pages; 5718c2ecf20Sopenharmony_ci 5728c2ecf20Sopenharmony_ci /* 5738c2ecf20Sopenharmony_ci * vm_flags is protected by the mmap_lock held in write mode. 5748c2ecf20Sopenharmony_ci * It's okay if try_to_unmap_one unmaps a page just after we 5758c2ecf20Sopenharmony_ci * set VM_LOCKED, populate_vma_page_range will bring it back. 5768c2ecf20Sopenharmony_ci */ 5778c2ecf20Sopenharmony_ci 5788c2ecf20Sopenharmony_ci if (lock) 5798c2ecf20Sopenharmony_ci vma->vm_flags = newflags; 5808c2ecf20Sopenharmony_ci else 5818c2ecf20Sopenharmony_ci munlock_vma_pages_range(vma, start, end); 5828c2ecf20Sopenharmony_ci 5838c2ecf20Sopenharmony_ciout: 5848c2ecf20Sopenharmony_ci *prev = vma; 5858c2ecf20Sopenharmony_ci return ret; 5868c2ecf20Sopenharmony_ci} 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_cistatic int apply_vma_lock_flags(unsigned long start, size_t len, 5898c2ecf20Sopenharmony_ci vm_flags_t flags) 5908c2ecf20Sopenharmony_ci{ 5918c2ecf20Sopenharmony_ci unsigned long nstart, end, tmp; 5928c2ecf20Sopenharmony_ci struct vm_area_struct * vma, * prev; 5938c2ecf20Sopenharmony_ci int error; 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci VM_BUG_ON(offset_in_page(start)); 5968c2ecf20Sopenharmony_ci VM_BUG_ON(len != PAGE_ALIGN(len)); 5978c2ecf20Sopenharmony_ci end = start + len; 5988c2ecf20Sopenharmony_ci if (end < start) 5998c2ecf20Sopenharmony_ci return -EINVAL; 6008c2ecf20Sopenharmony_ci if (end == start) 6018c2ecf20Sopenharmony_ci return 0; 6028c2ecf20Sopenharmony_ci vma = find_vma(current->mm, start); 6038c2ecf20Sopenharmony_ci if (!vma || vma->vm_start > start) 6048c2ecf20Sopenharmony_ci return -ENOMEM; 6058c2ecf20Sopenharmony_ci 6068c2ecf20Sopenharmony_ci prev = vma->vm_prev; 6078c2ecf20Sopenharmony_ci if (start > vma->vm_start) 6088c2ecf20Sopenharmony_ci prev = vma; 6098c2ecf20Sopenharmony_ci 6108c2ecf20Sopenharmony_ci for (nstart = start ; ; ) { 6118c2ecf20Sopenharmony_ci vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 6128c2ecf20Sopenharmony_ci 6138c2ecf20Sopenharmony_ci newflags |= flags; 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_ci /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 6168c2ecf20Sopenharmony_ci tmp = vma->vm_end; 6178c2ecf20Sopenharmony_ci if (tmp > end) 6188c2ecf20Sopenharmony_ci tmp = end; 6198c2ecf20Sopenharmony_ci error = mlock_fixup(vma, &prev, nstart, tmp, newflags); 6208c2ecf20Sopenharmony_ci if (error) 6218c2ecf20Sopenharmony_ci break; 6228c2ecf20Sopenharmony_ci nstart = tmp; 6238c2ecf20Sopenharmony_ci if (nstart < prev->vm_end) 6248c2ecf20Sopenharmony_ci nstart = prev->vm_end; 6258c2ecf20Sopenharmony_ci if (nstart >= end) 6268c2ecf20Sopenharmony_ci break; 6278c2ecf20Sopenharmony_ci 6288c2ecf20Sopenharmony_ci vma = prev->vm_next; 6298c2ecf20Sopenharmony_ci if (!vma || vma->vm_start != nstart) { 6308c2ecf20Sopenharmony_ci error = -ENOMEM; 6318c2ecf20Sopenharmony_ci break; 6328c2ecf20Sopenharmony_ci } 6338c2ecf20Sopenharmony_ci } 6348c2ecf20Sopenharmony_ci return error; 6358c2ecf20Sopenharmony_ci} 6368c2ecf20Sopenharmony_ci 6378c2ecf20Sopenharmony_ci/* 6388c2ecf20Sopenharmony_ci * Go through vma areas and sum size of mlocked 6398c2ecf20Sopenharmony_ci * vma pages, as return value. 6408c2ecf20Sopenharmony_ci * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) 6418c2ecf20Sopenharmony_ci * is also counted. 6428c2ecf20Sopenharmony_ci * Return value: previously mlocked page counts 6438c2ecf20Sopenharmony_ci */ 6448c2ecf20Sopenharmony_cistatic unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, 6458c2ecf20Sopenharmony_ci unsigned long start, size_t len) 6468c2ecf20Sopenharmony_ci{ 6478c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 6488c2ecf20Sopenharmony_ci unsigned long count = 0; 6498c2ecf20Sopenharmony_ci 6508c2ecf20Sopenharmony_ci if (mm == NULL) 6518c2ecf20Sopenharmony_ci mm = current->mm; 6528c2ecf20Sopenharmony_ci 6538c2ecf20Sopenharmony_ci vma = find_vma(mm, start); 6548c2ecf20Sopenharmony_ci if (vma == NULL) 6558c2ecf20Sopenharmony_ci vma = mm->mmap; 6568c2ecf20Sopenharmony_ci 6578c2ecf20Sopenharmony_ci for (; vma ; vma = vma->vm_next) { 6588c2ecf20Sopenharmony_ci if (start >= vma->vm_end) 6598c2ecf20Sopenharmony_ci continue; 6608c2ecf20Sopenharmony_ci if (start + len <= vma->vm_start) 6618c2ecf20Sopenharmony_ci break; 6628c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_LOCKED) { 6638c2ecf20Sopenharmony_ci if (start > vma->vm_start) 6648c2ecf20Sopenharmony_ci count -= (start - vma->vm_start); 6658c2ecf20Sopenharmony_ci if (start + len < vma->vm_end) { 6668c2ecf20Sopenharmony_ci count += start + len - vma->vm_start; 6678c2ecf20Sopenharmony_ci break; 6688c2ecf20Sopenharmony_ci } 6698c2ecf20Sopenharmony_ci count += vma->vm_end - vma->vm_start; 6708c2ecf20Sopenharmony_ci } 6718c2ecf20Sopenharmony_ci } 6728c2ecf20Sopenharmony_ci 6738c2ecf20Sopenharmony_ci return count >> PAGE_SHIFT; 6748c2ecf20Sopenharmony_ci} 6758c2ecf20Sopenharmony_ci 6768c2ecf20Sopenharmony_cistatic __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) 6778c2ecf20Sopenharmony_ci{ 6788c2ecf20Sopenharmony_ci unsigned long locked; 6798c2ecf20Sopenharmony_ci unsigned long lock_limit; 6808c2ecf20Sopenharmony_ci int error = -ENOMEM; 6818c2ecf20Sopenharmony_ci 6828c2ecf20Sopenharmony_ci start = untagged_addr(start); 6838c2ecf20Sopenharmony_ci 6848c2ecf20Sopenharmony_ci if (!can_do_mlock()) 6858c2ecf20Sopenharmony_ci return -EPERM; 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci len = PAGE_ALIGN(len + (offset_in_page(start))); 6888c2ecf20Sopenharmony_ci start &= PAGE_MASK; 6898c2ecf20Sopenharmony_ci 6908c2ecf20Sopenharmony_ci lock_limit = rlimit(RLIMIT_MEMLOCK); 6918c2ecf20Sopenharmony_ci lock_limit >>= PAGE_SHIFT; 6928c2ecf20Sopenharmony_ci locked = len >> PAGE_SHIFT; 6938c2ecf20Sopenharmony_ci 6948c2ecf20Sopenharmony_ci if (mmap_write_lock_killable(current->mm)) 6958c2ecf20Sopenharmony_ci return -EINTR; 6968c2ecf20Sopenharmony_ci 6978c2ecf20Sopenharmony_ci locked += current->mm->locked_vm; 6988c2ecf20Sopenharmony_ci if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { 6998c2ecf20Sopenharmony_ci /* 7008c2ecf20Sopenharmony_ci * It is possible that the regions requested intersect with 7018c2ecf20Sopenharmony_ci * previously mlocked areas, that part area in "mm->locked_vm" 7028c2ecf20Sopenharmony_ci * should not be counted to new mlock increment count. So check 7038c2ecf20Sopenharmony_ci * and adjust locked count if necessary. 7048c2ecf20Sopenharmony_ci */ 7058c2ecf20Sopenharmony_ci locked -= count_mm_mlocked_page_nr(current->mm, 7068c2ecf20Sopenharmony_ci start, len); 7078c2ecf20Sopenharmony_ci } 7088c2ecf20Sopenharmony_ci 7098c2ecf20Sopenharmony_ci /* check against resource limits */ 7108c2ecf20Sopenharmony_ci if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 7118c2ecf20Sopenharmony_ci error = apply_vma_lock_flags(start, len, flags); 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci mmap_write_unlock(current->mm); 7148c2ecf20Sopenharmony_ci if (error) 7158c2ecf20Sopenharmony_ci return error; 7168c2ecf20Sopenharmony_ci 7178c2ecf20Sopenharmony_ci error = __mm_populate(start, len, 0); 7188c2ecf20Sopenharmony_ci if (error) 7198c2ecf20Sopenharmony_ci return __mlock_posix_error_return(error); 7208c2ecf20Sopenharmony_ci return 0; 7218c2ecf20Sopenharmony_ci} 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ciSYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 7248c2ecf20Sopenharmony_ci{ 7258c2ecf20Sopenharmony_ci return do_mlock(start, len, VM_LOCKED); 7268c2ecf20Sopenharmony_ci} 7278c2ecf20Sopenharmony_ci 7288c2ecf20Sopenharmony_ciSYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) 7298c2ecf20Sopenharmony_ci{ 7308c2ecf20Sopenharmony_ci vm_flags_t vm_flags = VM_LOCKED; 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci if (flags & ~MLOCK_ONFAULT) 7338c2ecf20Sopenharmony_ci return -EINVAL; 7348c2ecf20Sopenharmony_ci 7358c2ecf20Sopenharmony_ci if (flags & MLOCK_ONFAULT) 7368c2ecf20Sopenharmony_ci vm_flags |= VM_LOCKONFAULT; 7378c2ecf20Sopenharmony_ci 7388c2ecf20Sopenharmony_ci return do_mlock(start, len, vm_flags); 7398c2ecf20Sopenharmony_ci} 7408c2ecf20Sopenharmony_ci 7418c2ecf20Sopenharmony_ciSYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) 7428c2ecf20Sopenharmony_ci{ 7438c2ecf20Sopenharmony_ci int ret; 7448c2ecf20Sopenharmony_ci 7458c2ecf20Sopenharmony_ci start = untagged_addr(start); 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci len = PAGE_ALIGN(len + (offset_in_page(start))); 7488c2ecf20Sopenharmony_ci start &= PAGE_MASK; 7498c2ecf20Sopenharmony_ci 7508c2ecf20Sopenharmony_ci if (mmap_write_lock_killable(current->mm)) 7518c2ecf20Sopenharmony_ci return -EINTR; 7528c2ecf20Sopenharmony_ci ret = apply_vma_lock_flags(start, len, 0); 7538c2ecf20Sopenharmony_ci mmap_write_unlock(current->mm); 7548c2ecf20Sopenharmony_ci 7558c2ecf20Sopenharmony_ci return ret; 7568c2ecf20Sopenharmony_ci} 7578c2ecf20Sopenharmony_ci 7588c2ecf20Sopenharmony_ci/* 7598c2ecf20Sopenharmony_ci * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) 7608c2ecf20Sopenharmony_ci * and translate into the appropriate modifications to mm->def_flags and/or the 7618c2ecf20Sopenharmony_ci * flags for all current VMAs. 7628c2ecf20Sopenharmony_ci * 7638c2ecf20Sopenharmony_ci * There are a couple of subtleties with this. If mlockall() is called multiple 7648c2ecf20Sopenharmony_ci * times with different flags, the values do not necessarily stack. If mlockall 7658c2ecf20Sopenharmony_ci * is called once including the MCL_FUTURE flag and then a second time without 7668c2ecf20Sopenharmony_ci * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. 7678c2ecf20Sopenharmony_ci */ 7688c2ecf20Sopenharmony_cistatic int apply_mlockall_flags(int flags) 7698c2ecf20Sopenharmony_ci{ 7708c2ecf20Sopenharmony_ci struct vm_area_struct * vma, * prev = NULL; 7718c2ecf20Sopenharmony_ci vm_flags_t to_add = 0; 7728c2ecf20Sopenharmony_ci 7738c2ecf20Sopenharmony_ci current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; 7748c2ecf20Sopenharmony_ci if (flags & MCL_FUTURE) { 7758c2ecf20Sopenharmony_ci current->mm->def_flags |= VM_LOCKED; 7768c2ecf20Sopenharmony_ci 7778c2ecf20Sopenharmony_ci if (flags & MCL_ONFAULT) 7788c2ecf20Sopenharmony_ci current->mm->def_flags |= VM_LOCKONFAULT; 7798c2ecf20Sopenharmony_ci 7808c2ecf20Sopenharmony_ci if (!(flags & MCL_CURRENT)) 7818c2ecf20Sopenharmony_ci goto out; 7828c2ecf20Sopenharmony_ci } 7838c2ecf20Sopenharmony_ci 7848c2ecf20Sopenharmony_ci if (flags & MCL_CURRENT) { 7858c2ecf20Sopenharmony_ci to_add |= VM_LOCKED; 7868c2ecf20Sopenharmony_ci if (flags & MCL_ONFAULT) 7878c2ecf20Sopenharmony_ci to_add |= VM_LOCKONFAULT; 7888c2ecf20Sopenharmony_ci } 7898c2ecf20Sopenharmony_ci 7908c2ecf20Sopenharmony_ci for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 7918c2ecf20Sopenharmony_ci vm_flags_t newflags; 7928c2ecf20Sopenharmony_ci 7938c2ecf20Sopenharmony_ci newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 7948c2ecf20Sopenharmony_ci newflags |= to_add; 7958c2ecf20Sopenharmony_ci 7968c2ecf20Sopenharmony_ci /* Ignore errors */ 7978c2ecf20Sopenharmony_ci mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 7988c2ecf20Sopenharmony_ci cond_resched(); 7998c2ecf20Sopenharmony_ci } 8008c2ecf20Sopenharmony_ciout: 8018c2ecf20Sopenharmony_ci return 0; 8028c2ecf20Sopenharmony_ci} 8038c2ecf20Sopenharmony_ci 8048c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(mlockall, int, flags) 8058c2ecf20Sopenharmony_ci{ 8068c2ecf20Sopenharmony_ci unsigned long lock_limit; 8078c2ecf20Sopenharmony_ci int ret; 8088c2ecf20Sopenharmony_ci 8098c2ecf20Sopenharmony_ci if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || 8108c2ecf20Sopenharmony_ci flags == MCL_ONFAULT) 8118c2ecf20Sopenharmony_ci return -EINVAL; 8128c2ecf20Sopenharmony_ci 8138c2ecf20Sopenharmony_ci if (!can_do_mlock()) 8148c2ecf20Sopenharmony_ci return -EPERM; 8158c2ecf20Sopenharmony_ci 8168c2ecf20Sopenharmony_ci lock_limit = rlimit(RLIMIT_MEMLOCK); 8178c2ecf20Sopenharmony_ci lock_limit >>= PAGE_SHIFT; 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ci if (mmap_write_lock_killable(current->mm)) 8208c2ecf20Sopenharmony_ci return -EINTR; 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci ret = -ENOMEM; 8238c2ecf20Sopenharmony_ci if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 8248c2ecf20Sopenharmony_ci capable(CAP_IPC_LOCK)) 8258c2ecf20Sopenharmony_ci ret = apply_mlockall_flags(flags); 8268c2ecf20Sopenharmony_ci mmap_write_unlock(current->mm); 8278c2ecf20Sopenharmony_ci if (!ret && (flags & MCL_CURRENT)) 8288c2ecf20Sopenharmony_ci mm_populate(0, TASK_SIZE); 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci return ret; 8318c2ecf20Sopenharmony_ci} 8328c2ecf20Sopenharmony_ci 8338c2ecf20Sopenharmony_ciSYSCALL_DEFINE0(munlockall) 8348c2ecf20Sopenharmony_ci{ 8358c2ecf20Sopenharmony_ci int ret; 8368c2ecf20Sopenharmony_ci 8378c2ecf20Sopenharmony_ci if (mmap_write_lock_killable(current->mm)) 8388c2ecf20Sopenharmony_ci return -EINTR; 8398c2ecf20Sopenharmony_ci ret = apply_mlockall_flags(0); 8408c2ecf20Sopenharmony_ci mmap_write_unlock(current->mm); 8418c2ecf20Sopenharmony_ci return ret; 8428c2ecf20Sopenharmony_ci} 8438c2ecf20Sopenharmony_ci 8448c2ecf20Sopenharmony_ci/* 8458c2ecf20Sopenharmony_ci * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB 8468c2ecf20Sopenharmony_ci * shm segments) get accounted against the user_struct instead. 8478c2ecf20Sopenharmony_ci */ 8488c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(shmlock_user_lock); 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_ciint user_shm_lock(size_t size, struct user_struct *user) 8518c2ecf20Sopenharmony_ci{ 8528c2ecf20Sopenharmony_ci unsigned long lock_limit, locked; 8538c2ecf20Sopenharmony_ci int allowed = 0; 8548c2ecf20Sopenharmony_ci 8558c2ecf20Sopenharmony_ci locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 8568c2ecf20Sopenharmony_ci lock_limit = rlimit(RLIMIT_MEMLOCK); 8578c2ecf20Sopenharmony_ci if (lock_limit == RLIM_INFINITY) 8588c2ecf20Sopenharmony_ci allowed = 1; 8598c2ecf20Sopenharmony_ci lock_limit >>= PAGE_SHIFT; 8608c2ecf20Sopenharmony_ci spin_lock(&shmlock_user_lock); 8618c2ecf20Sopenharmony_ci if (!allowed && 8628c2ecf20Sopenharmony_ci locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) 8638c2ecf20Sopenharmony_ci goto out; 8648c2ecf20Sopenharmony_ci get_uid(user); 8658c2ecf20Sopenharmony_ci user->locked_shm += locked; 8668c2ecf20Sopenharmony_ci allowed = 1; 8678c2ecf20Sopenharmony_ciout: 8688c2ecf20Sopenharmony_ci spin_unlock(&shmlock_user_lock); 8698c2ecf20Sopenharmony_ci return allowed; 8708c2ecf20Sopenharmony_ci} 8718c2ecf20Sopenharmony_ci 8728c2ecf20Sopenharmony_civoid user_shm_unlock(size_t size, struct user_struct *user) 8738c2ecf20Sopenharmony_ci{ 8748c2ecf20Sopenharmony_ci spin_lock(&shmlock_user_lock); 8758c2ecf20Sopenharmony_ci user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 8768c2ecf20Sopenharmony_ci spin_unlock(&shmlock_user_lock); 8778c2ecf20Sopenharmony_ci free_uid(user); 8788c2ecf20Sopenharmony_ci} 879