Line data Source code
1 : /* SPDX-License-Identifier: GPL-2.0 */
2 : #ifndef _LINUX_MM_H
3 : #define _LINUX_MM_H
4 :
5 : #include <linux/errno.h>
6 : #include <linux/mmdebug.h>
7 : #include <linux/gfp.h>
8 : #include <linux/bug.h>
9 : #include <linux/list.h>
10 : #include <linux/mmzone.h>
11 : #include <linux/rbtree.h>
12 : #include <linux/atomic.h>
13 : #include <linux/debug_locks.h>
14 : #include <linux/mm_types.h>
15 : #include <linux/mmap_lock.h>
16 : #include <linux/range.h>
17 : #include <linux/pfn.h>
18 : #include <linux/percpu-refcount.h>
19 : #include <linux/bit_spinlock.h>
20 : #include <linux/shrinker.h>
21 : #include <linux/resource.h>
22 : #include <linux/page_ext.h>
23 : #include <linux/err.h>
24 : #include <linux/page-flags.h>
25 : #include <linux/page_ref.h>
26 : #include <linux/overflow.h>
27 : #include <linux/sizes.h>
28 : #include <linux/sched.h>
29 : #include <linux/pgtable.h>
30 : #include <linux/kasan.h>
31 : #include <linux/memremap.h>
32 : #include <linux/slab.h>
33 :
34 : struct mempolicy;
35 : struct anon_vma;
36 : struct anon_vma_chain;
37 : struct user_struct;
38 : struct pt_regs;
39 :
40 : extern int sysctl_page_lock_unfairness;
41 :
42 : void mm_core_init(void);
43 : void init_mm_internals(void);
44 :
45 : #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
46 : extern unsigned long max_mapnr;
47 :
48 : static inline void set_max_mapnr(unsigned long limit)
49 : {
50 : max_mapnr = limit;
51 : }
52 : #else
53 : static inline void set_max_mapnr(unsigned long limit) { }
54 : #endif
55 :
56 : extern atomic_long_t _totalram_pages;
57 : static inline unsigned long totalram_pages(void)
58 : {
59 0 : return (unsigned long)atomic_long_read(&_totalram_pages);
60 : }
61 :
62 : static inline void totalram_pages_inc(void)
63 : {
64 : atomic_long_inc(&_totalram_pages);
65 : }
66 :
67 : static inline void totalram_pages_dec(void)
68 : {
69 : atomic_long_dec(&_totalram_pages);
70 : }
71 :
72 : static inline void totalram_pages_add(long count)
73 : {
74 : atomic_long_add(count, &_totalram_pages);
75 : }
76 :
77 : extern void * high_memory;
78 : extern int page_cluster;
79 : extern const int page_cluster_max;
80 :
81 : #ifdef CONFIG_SYSCTL
82 : extern int sysctl_legacy_va_layout;
83 : #else
84 : #define sysctl_legacy_va_layout 0
85 : #endif
86 :
87 : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
88 : extern const int mmap_rnd_bits_min;
89 : extern const int mmap_rnd_bits_max;
90 : extern int mmap_rnd_bits __read_mostly;
91 : #endif
92 : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
93 : extern const int mmap_rnd_compat_bits_min;
94 : extern const int mmap_rnd_compat_bits_max;
95 : extern int mmap_rnd_compat_bits __read_mostly;
96 : #endif
97 :
98 : #include <asm/page.h>
99 : #include <asm/processor.h>
100 :
101 : #ifndef __pa_symbol
102 : #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0))
103 : #endif
104 :
105 : #ifndef page_to_virt
106 : #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
107 : #endif
108 :
109 : #ifndef lm_alias
110 : #define lm_alias(x) __va(__pa_symbol(x))
111 : #endif
112 :
113 : /*
114 : * Used to prevent the common memory management code from establishing
115 : * a zero page mapping on a read fault.
116 : * This macro should be defined within <asm/pgtable.h>.
117 : * s390 does this to prevent multiplexing of hardware bits
118 : * related to the physical page in case of virtualization.
119 : */
120 : #ifndef mm_forbids_zeropage
121 : #define mm_forbids_zeropage(X) (0)
122 : #endif
123 :
124 : /*
125 : * On some architectures it is expensive to call memset() for small sizes.
126 : * If an architecture decides to implement its own version of
127 : * mm_zero_struct_page, it should wrap the defines below in a #ifndef and
128 : * define its own version of this macro in <asm/pgtable.h>.
129 : */
130 : #if BITS_PER_LONG == 64
131 : /* This function must be updated when the size of struct page grows above 96
132 : * or shrinks below 56. The idea is that the compiler optimizes the switch()
133 : * statement away and leaves only move/store instructions. The compiler can
134 : * also combine write statements if they are both assignments and can be
135 : * reordered; this can result in several of the writes here being dropped.
136 : */
137 : #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
138 : static inline void __mm_zero_struct_page(struct page *page)
139 : {
140 : unsigned long *_pp = (void *)page;
141 :
142 : /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
143 : BUILD_BUG_ON(sizeof(struct page) & 7);
144 : BUILD_BUG_ON(sizeof(struct page) < 56);
145 : BUILD_BUG_ON(sizeof(struct page) > 96);
146 :
147 : switch (sizeof(struct page)) {
148 : case 96:
149 : _pp[11] = 0;
150 : fallthrough;
151 : case 88:
152 : _pp[10] = 0;
153 : fallthrough;
154 : case 80:
155 : _pp[9] = 0;
156 : fallthrough;
157 : case 72:
158 : _pp[8] = 0;
159 : fallthrough;
160 : case 64:
161 : _pp[7] = 0;
162 : fallthrough;
163 : case 56:
164 : _pp[6] = 0;
165 : _pp[5] = 0;
166 : _pp[4] = 0;
167 : _pp[3] = 0;
168 : _pp[2] = 0;
169 : _pp[1] = 0;
170 : _pp[0] = 0;
171 : }
172 : }
173 : #else
174 : #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
175 : #endif
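/*
 * Illustrative sketch, not part of mm.h: an architecture with a cheaper
 * zeroing primitive would define the macro from its <asm/pgtable.h> (pulled
 * in via <linux/pgtable.h> above) and guard the generic definitions above
 * with #ifndef. The helper name __arch_zero_struct_page is made up here:
 *
 *	#define mm_zero_struct_page(pp)	__arch_zero_struct_page(pp)
 */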
176 :
177 : /*
178 : * Default maximum number of active map areas; this limits the number of vmas
179 : * per mm struct. Users can override this number via sysctl, but there is a
180 : * problem.
181 : *
182 : * When a program's coredump is generated in ELF format, one section is created
183 : * per vma. In ELF, the number of sections is stored as an unsigned short,
184 : * so the number of sections must be smaller than 65535 at coredump time.
185 : * Because the kernel adds some informative sections to the program image
186 : * when generating the coredump, we need some margin. The number of extra
187 : * sections is currently 1-3, depending on the arch; we use "5" as a safe margin.
188 : *
189 : * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
190 : * is no longer a hard limit, although some userspace tools can be surprised
191 : * by that.
192 : */
193 : #define MAPCOUNT_ELF_CORE_MARGIN (5)
194 : #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
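/* With USHRT_MAX == 65535, DEFAULT_MAX_MAP_COUNT works out to 65535 - 5 = 65530. */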
195 :
196 : extern int sysctl_max_map_count;
197 :
198 : extern unsigned long sysctl_user_reserve_kbytes;
199 : extern unsigned long sysctl_admin_reserve_kbytes;
200 :
201 : extern int sysctl_overcommit_memory;
202 : extern int sysctl_overcommit_ratio;
203 : extern unsigned long sysctl_overcommit_kbytes;
204 :
205 : int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
206 : loff_t *);
207 : int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
208 : loff_t *);
209 : int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
210 : loff_t *);
211 :
212 : #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
213 : #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
214 : #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
215 : #else
216 : #define nth_page(page,n) ((page) + (n))
217 : #define folio_page_idx(folio, p) ((p) - &(folio)->page)
218 : #endif
219 :
220 : /* to align the pointer to the (next) page boundary */
221 : #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
222 :
223 : /* to align the pointer to the (prev) page boundary */
224 : #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)
225 :
226 : /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
227 : #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
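/*
 * Worked example, assuming PAGE_SIZE == 4096 (0x1000):
 *	PAGE_ALIGN(0x1234)	== 0x2000
 *	PAGE_ALIGN_DOWN(0x1234)	== 0x1000
 *	PAGE_ALIGNED(0x1234)	== false, PAGE_ALIGNED(0x2000) == true
 */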
228 :
229 : #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
230 : static inline struct folio *lru_to_folio(struct list_head *head)
231 : {
232 : return list_entry((head)->prev, struct folio, lru);
233 : }
234 :
235 : void setup_initial_init_mm(void *start_code, void *end_code,
236 : void *end_data, void *brk);
237 :
238 : /*
239 : * Linux kernel virtual memory manager primitives.
240 : * The idea being to have a "virtual" mm in the same way
241 : * we have a virtual fs - giving a cleaner interface to the
242 : * mm details, and allowing different kinds of memory mappings
243 : * (from shared memory to executable loading to arbitrary
244 : * mmap() functions).
245 : */
246 :
247 : struct vm_area_struct *vm_area_alloc(struct mm_struct *);
248 : struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
249 : void vm_area_free(struct vm_area_struct *);
250 : /* Use only if VMA has no other users */
251 : void __vm_area_free(struct vm_area_struct *vma);
252 :
253 : #ifndef CONFIG_MMU
254 : extern struct rb_root nommu_region_tree;
255 : extern struct rw_semaphore nommu_region_sem;
256 :
257 : extern unsigned int kobjsize(const void *objp);
258 : #endif
259 :
260 : /*
261 : * vm_flags in vm_area_struct, see mm_types.h.
262 : * When changing, update also include/trace/events/mmflags.h
263 : */
264 : #define VM_NONE 0x00000000
265 :
266 : #define VM_READ 0x00000001 /* currently active flags */
267 : #define VM_WRITE 0x00000002
268 : #define VM_EXEC 0x00000004
269 : #define VM_SHARED 0x00000008
270 :
271 : /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
272 : #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
273 : #define VM_MAYWRITE 0x00000020
274 : #define VM_MAYEXEC 0x00000040
275 : #define VM_MAYSHARE 0x00000080
276 :
277 : #define VM_GROWSDOWN 0x00000100 /* general info on the segment */
278 : #ifdef CONFIG_MMU
279 : #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
280 : #else /* CONFIG_MMU */
281 : #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
282 : #define VM_UFFD_MISSING 0
283 : #endif /* CONFIG_MMU */
284 : #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
285 : #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
286 :
287 : #define VM_LOCKED 0x00002000
288 : #define VM_IO 0x00004000 /* Memory mapped I/O or similar */
289 :
290 : /* Used by sys_madvise() */
291 : #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
292 : #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
293 :
294 : #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
295 : #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
296 : #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
297 : #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
298 : #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
299 : #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
300 : #define VM_SYNC 0x00800000 /* Synchronous page faults */
301 : #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
302 : #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
303 : #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
304 :
305 : #ifdef CONFIG_MEM_SOFT_DIRTY
306 : # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
307 : #else
308 : # define VM_SOFTDIRTY 0
309 : #endif
310 :
311 : #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
312 : #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
313 : #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
314 : #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
315 :
316 : #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
317 : #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
318 : #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
319 : #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
320 : #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
321 : #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
322 : #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
323 : #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
324 : #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
325 : #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
326 : #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
327 : #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
328 :
329 : #ifdef CONFIG_ARCH_HAS_PKEYS
330 : # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
331 : # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
332 : # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */
333 : # define VM_PKEY_BIT2 VM_HIGH_ARCH_2
334 : # define VM_PKEY_BIT3 VM_HIGH_ARCH_3
335 : #ifdef CONFIG_PPC
336 : # define VM_PKEY_BIT4 VM_HIGH_ARCH_4
337 : #else
338 : # define VM_PKEY_BIT4 0
339 : #endif
340 : #endif /* CONFIG_ARCH_HAS_PKEYS */
341 :
342 : #if defined(CONFIG_X86)
343 : # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
344 : #elif defined(CONFIG_PPC)
345 : # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
346 : #elif defined(CONFIG_PARISC)
347 : # define VM_GROWSUP VM_ARCH_1
348 : #elif defined(CONFIG_IA64)
349 : # define VM_GROWSUP VM_ARCH_1
350 : #elif defined(CONFIG_SPARC64)
351 : # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
352 : # define VM_ARCH_CLEAR VM_SPARC_ADI
353 : #elif defined(CONFIG_ARM64)
354 : # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */
355 : # define VM_ARCH_CLEAR VM_ARM64_BTI
356 : #elif !defined(CONFIG_MMU)
357 : # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
358 : #endif
359 :
360 : #if defined(CONFIG_ARM64_MTE)
361 : # define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */
362 : # define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */
363 : #else
364 : # define VM_MTE VM_NONE
365 : # define VM_MTE_ALLOWED VM_NONE
366 : #endif
367 :
368 : #ifndef VM_GROWSUP
369 : # define VM_GROWSUP VM_NONE
370 : #endif
371 :
372 : #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
373 : # define VM_UFFD_MINOR_BIT 37
374 : # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
375 : #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
376 : # define VM_UFFD_MINOR VM_NONE
377 : #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
378 :
379 : /* Bits set in the VMA until the stack is in its final location */
380 : #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
381 :
382 : #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
383 :
384 : /* Common data flag combinations */
385 : #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
386 : VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
387 : #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
388 : VM_MAYWRITE | VM_MAYEXEC)
389 : #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
390 : VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
391 :
392 : #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
393 : #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
394 : #endif
395 :
396 : #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
397 : #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
398 : #endif
399 :
400 : #ifdef CONFIG_STACK_GROWSUP
401 : #define VM_STACK VM_GROWSUP
402 : #define VM_STACK_EARLY VM_GROWSDOWN
403 : #else
404 : #define VM_STACK VM_GROWSDOWN
405 : #define VM_STACK_EARLY 0
406 : #endif
407 :
408 : #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
409 :
410 : /* VMA basic access permission flags */
411 : #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
412 :
413 :
414 : /*
415 : * Special vmas that are non-mergable, non-mlock()able.
416 : */
417 : #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
418 :
419 : /* This mask prevents a VMA from being scanned by khugepaged */
420 : #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
421 :
422 : /* This mask defines which mm->def_flags a process can inherit from its parent */
423 : #define VM_INIT_DEF_MASK VM_NOHUGEPAGE
424 :
425 : /* This mask represents all the VMA flag bits used by mlock */
426 : #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
427 :
428 : /* Arch-specific flags to clear when updating VM flags on protection change */
429 : #ifndef VM_ARCH_CLEAR
430 : # define VM_ARCH_CLEAR VM_NONE
431 : #endif
432 : #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
433 :
434 : /*
435 : * mapping from the currently active vm_flags protection bits (the
436 : * low four bits) to a page protection mask.
437 : */
438 :
439 : /*
440 : * The default fault flags that should be used by most of the
441 : * arch-specific page fault handlers.
442 : */
443 : #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \
444 : FAULT_FLAG_KILLABLE | \
445 : FAULT_FLAG_INTERRUPTIBLE)
446 :
447 : /**
448 : * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
449 : * @flags: Fault flags.
450 : *
451 : * This is mostly used in places where we want to avoid holding
452 : * the mmap_lock for too long while waiting for another condition
453 : * to change, in which case we can be polite and release the
454 : * mmap_lock in the first round to avoid potential starvation of other
455 : * processes that also want the mmap_lock.
456 : *
457 : * Return: true if the page fault allows retry and this is the first
458 : * attempt of the fault handling; false otherwise.
459 : */
460 : static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
461 : {
462 3621848 : return (flags & FAULT_FLAG_ALLOW_RETRY) &&
463 : (!(flags & FAULT_FLAG_TRIED));
464 : }
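/*
 * Illustrative sketch, not part of mm.h: the retry convention described
 * above, loosely modelled on the arch page-fault handlers. Re-taking
 * mmap_lock and re-looking-up the VMA are omitted, and example_handle_fault
 * is a made-up name.
 */
#if 0	/* example only */
static vm_fault_t example_handle_fault(struct vm_area_struct *vma,
					unsigned long address,
					unsigned int flags,
					struct pt_regs *regs)
{
	vm_fault_t fault = handle_mm_fault(vma, address, flags, regs);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * The core released mmap_lock before returning, which it only
		 * does when fault_flag_allow_retry_first() was true. Retry
		 * once with FAULT_FLAG_TRIED set (re-locking and re-lookup of
		 * the VMA are omitted from this sketch).
		 */
		flags |= FAULT_FLAG_TRIED;
		fault = handle_mm_fault(vma, address, flags, regs);
	}
	return fault;
}
#endif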
465 :
466 : #define FAULT_FLAG_TRACE \
467 : { FAULT_FLAG_WRITE, "WRITE" }, \
468 : { FAULT_FLAG_MKWRITE, "MKWRITE" }, \
469 : { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \
470 : { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \
471 : { FAULT_FLAG_KILLABLE, "KILLABLE" }, \
472 : { FAULT_FLAG_TRIED, "TRIED" }, \
473 : { FAULT_FLAG_USER, "USER" }, \
474 : { FAULT_FLAG_REMOTE, "REMOTE" }, \
475 : { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
476 : { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
477 : { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" }
478 :
479 : /*
480 : * vm_fault is filled by the pagefault handler and passed to the vma's
481 : * ->fault function. The vma's ->fault is responsible for returning a bitmask
482 : * of VM_FAULT_xxx flags that give details about how the fault was handled.
483 : *
484 : * The MM layer fills in gfp_mask for page allocations, but the fault handler
485 : * might alter it if its implementation requires a different allocation context.
486 : *
487 : * pgoff should be used in favour of virtual_address, if possible.
488 : */
489 : struct vm_fault {
490 : const struct {
491 : struct vm_area_struct *vma; /* Target VMA */
492 : gfp_t gfp_mask; /* gfp mask to be used for allocations */
493 : pgoff_t pgoff; /* Logical page offset based on vma */
494 : unsigned long address; /* Faulting virtual address - masked */
495 : unsigned long real_address; /* Faulting virtual address - unmasked */
496 : };
497 : enum fault_flag flags; /* FAULT_FLAG_xxx flags
498 : * XXX: should really be 'const' */
499 : pmd_t *pmd; /* Pointer to pmd entry matching
500 : * the 'address' */
501 : pud_t *pud; /* Pointer to pud entry matching
502 : * the 'address'
503 : */
504 : union {
505 : pte_t orig_pte; /* Value of PTE at the time of fault */
506 : pmd_t orig_pmd; /* Value of PMD at the time of fault,
507 : * used by PMD fault only.
508 : */
509 : };
510 :
511 : struct page *cow_page; /* Page handler may use for COW fault */
512 : struct page *page; /* ->fault handlers should return a
513 : * page here, unless VM_FAULT_NOPAGE
514 : * is set (which is also implied by
515 : * VM_FAULT_ERROR).
516 : */
517 : /* These three entries are valid only while holding ptl lock */
518 : pte_t *pte; /* Pointer to pte entry matching
519 : * the 'address'. NULL if the page
520 : * table hasn't been allocated.
521 : */
522 : spinlock_t *ptl; /* Page table lock.
523 : * Protects pte page table if 'pte'
524 : * is not NULL, otherwise pmd.
525 : */
526 : pgtable_t prealloc_pte; /* Pre-allocated pte page table.
527 : * vm_ops->map_pages() sets up a page
528 : * table from atomic context.
529 : * do_fault_around() pre-allocates
530 : * page table to avoid allocation from
531 : * atomic context.
532 : */
533 : };
534 :
535 : /* page entry size for vm->huge_fault() */
536 : enum page_entry_size {
537 : PE_SIZE_PTE = 0,
538 : PE_SIZE_PMD,
539 : PE_SIZE_PUD,
540 : };
541 :
542 : /*
543 : * These are the virtual MM functions - opening of an area, closing and
544 : * unmapping it (needed to keep files on disk up-to-date etc), pointer
545 : * to the functions called when a no-page or a wp-page exception occurs.
546 : */
547 : struct vm_operations_struct {
548 : void (*open)(struct vm_area_struct * area);
549 : /**
550 : * @close: Called when the VMA is being removed from the MM.
551 : * Context: User context. May sleep. Caller holds mmap_lock.
552 : */
553 : void (*close)(struct vm_area_struct * area);
554 : /* Called any time before splitting to check if it's allowed */
555 : int (*may_split)(struct vm_area_struct *area, unsigned long addr);
556 : int (*mremap)(struct vm_area_struct *area);
557 : /*
558 : * Called by mprotect() to make driver-specific permission
559 : * checks before mprotect() is finalised. The VMA must not
560 : * be modified. Returns 0 if mprotect() can proceed.
561 : */
562 : int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
563 : unsigned long end, unsigned long newflags);
564 : vm_fault_t (*fault)(struct vm_fault *vmf);
565 : vm_fault_t (*huge_fault)(struct vm_fault *vmf,
566 : enum page_entry_size pe_size);
567 : vm_fault_t (*map_pages)(struct vm_fault *vmf,
568 : pgoff_t start_pgoff, pgoff_t end_pgoff);
569 : unsigned long (*pagesize)(struct vm_area_struct * area);
570 :
571 : /* Notification that a previously read-only page is about to become
572 : * writable; if an error is returned it will cause a SIGBUS. */
573 : vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
574 :
575 : /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
576 : vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
577 :
578 : /* called by access_process_vm when get_user_pages() fails, typically
579 : * for use by special VMAs. See also generic_access_phys() for a generic
580 : * implementation useful for any iomem mapping.
581 : */
582 : int (*access)(struct vm_area_struct *vma, unsigned long addr,
583 : void *buf, int len, int write);
584 :
585 : /* Called by the /proc/PID/maps code to ask the vma whether it
586 : * has a special name. Returning non-NULL will also cause this
587 : * vma to be dumped unconditionally. */
588 : const char *(*name)(struct vm_area_struct *vma);
589 :
590 : #ifdef CONFIG_NUMA
591 : /*
592 : * set_policy() op must add a reference to any non-NULL @new mempolicy
593 : * to hold the policy upon return. Caller should pass NULL @new to
594 : * remove a policy and fall back to surrounding context--i.e. do not
595 : * install a MPOL_DEFAULT policy, nor the task or system default
596 : * mempolicy.
597 : */
598 : int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
599 :
600 : /*
601 : * get_policy() op must add reference [mpol_get()] to any policy at
602 : * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
603 : * in mm/mempolicy.c will do this automatically.
604 : * get_policy() must NOT add a ref if the policy at (vma,addr) is not
605 : * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
606 : * If no [shared/vma] mempolicy exists at the addr, get_policy() op
607 : * must return NULL--i.e., do not "fallback" to task or system default
608 : * policy.
609 : */
610 : struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
611 : unsigned long addr);
612 : #endif
613 : /*
614 : * Called by vm_normal_page() for special PTEs to find the
615 : * page for @addr. This is useful if the default behavior
616 : * (using pte_page()) would not find the correct page.
617 : */
618 : struct page *(*find_special_page)(struct vm_area_struct *vma,
619 : unsigned long addr);
620 : };
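/*
 * Illustrative sketch, not part of mm.h: a minimal driver-style
 * vm_operations_struct that only wires up ->fault. The names
 * example_vm_fault, example_lookup_page and example_vm_ops are made up;
 * example_lookup_page stands for whatever backing-store lookup the driver
 * actually performs.
 */
#if 0	/* example only */
static vm_fault_t example_vm_fault(struct vm_fault *vmf)
{
	struct page *page = example_lookup_page(vmf->vma, vmf->pgoff);

	if (!page)
		return VM_FAULT_SIGBUS;
	get_page(page);		/* reference handed to the core via vmf->page */
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct example_vm_ops = {
	.fault	= example_vm_fault,
};
#endif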
621 :
622 : #ifdef CONFIG_NUMA_BALANCING
623 : static inline void vma_numab_state_init(struct vm_area_struct *vma)
624 : {
625 : vma->numab_state = NULL;
626 : }
627 : static inline void vma_numab_state_free(struct vm_area_struct *vma)
628 : {
629 : kfree(vma->numab_state);
630 : }
631 : #else
632 : static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
633 : static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
634 : #endif /* CONFIG_NUMA_BALANCING */
635 :
636 : #ifdef CONFIG_PER_VMA_LOCK
637 : /*
638 : * Try to read-lock a vma. The function is allowed to occasionally yield a false
639 : * locked result to avoid performance overhead, in which case we fall back to
640 : * using mmap_lock. The function should never yield a false unlocked result.
641 : */
642 : static inline bool vma_start_read(struct vm_area_struct *vma)
643 : {
644 : /*
645 : * Check before locking. A race might cause false locked result.
646 : * We can use READ_ONCE() for the mm_lock_seq here, and don't need
647 : * ACQUIRE semantics, because this is just a lockless check whose result
648 : * we don't rely on for anything - the mm_lock_seq read against which we
649 : * need ordering is below.
650 : */
651 : if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
652 : return false;
653 :
654 : if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
655 : return false;
656 :
657 : /*
658 : * Overflow might produce false locked result.
659 : * False unlocked result is impossible because we modify and check
660 : * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
661 : * modification invalidates all existing locks.
662 : *
663 : * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
664 : * racing with vma_end_write_all(), we only start reading from the VMA
665 : * after it has been unlocked.
666 : * This pairs with RELEASE semantics in vma_end_write_all().
667 : */
668 : if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
669 : up_read(&vma->vm_lock->lock);
670 : return false;
671 : }
672 : return true;
673 : }
674 :
675 : static inline void vma_end_read(struct vm_area_struct *vma)
676 : {
677 : rcu_read_lock(); /* keeps vma alive till the end of up_read */
678 : up_read(&vma->vm_lock->lock);
679 : rcu_read_unlock();
680 : }
681 :
682 : static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
683 : {
684 15954336 : mmap_assert_write_locked(vma->vm_mm);
685 :
686 : /*
687 : * The current task is holding mmap_write_lock, so neither vma->vm_lock_seq
688 : * nor mm->mm_lock_seq can be concurrently modified.
689 : */
690 15954336 : *mm_lock_seq = vma->vm_mm->mm_lock_seq;
691 15954336 : return (vma->vm_lock_seq == *mm_lock_seq);
692 : }
693 :
694 15954336 : static inline void vma_start_write(struct vm_area_struct *vma)
695 : {
696 15954336 : int mm_lock_seq;
697 :
698 15954336 : if (__is_vma_write_locked(vma, &mm_lock_seq))
699 : return;
700 :
701 480073 : down_write(&vma->vm_lock->lock);
702 : /*
703 : * We should use WRITE_ONCE() here because we can have concurrent reads
704 : * from the early lockless pessimistic check in vma_start_read().
705 : * We don't really care about the correctness of that early check, but
706 : * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
707 : */
708 480076 : WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
709 480076 : up_write(&vma->vm_lock->lock);
710 : }
711 :
712 : static inline bool vma_try_start_write(struct vm_area_struct *vma)
713 : {
714 : int mm_lock_seq;
715 :
716 : if (__is_vma_write_locked(vma, &mm_lock_seq))
717 : return true;
718 :
719 : if (!down_write_trylock(&vma->vm_lock->lock))
720 : return false;
721 :
722 : WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
723 : up_write(&vma->vm_lock->lock);
724 : return true;
725 : }
726 :
727 : static inline void vma_assert_write_locked(struct vm_area_struct *vma)
728 : {
729 : int mm_lock_seq;
730 :
731 : VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
732 : }
733 :
734 : static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
735 : {
736 : /* When detaching, the vma should be write-locked */
737 : if (detached)
738 : vma_assert_write_locked(vma);
739 : vma->detached = detached;
740 : }
741 :
742 : struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
743 : unsigned long address);
744 :
745 : #else /* CONFIG_PER_VMA_LOCK */
746 :
747 : static inline bool vma_start_read(struct vm_area_struct *vma)
748 : { return false; }
749 : static inline void vma_end_read(struct vm_area_struct *vma) {}
750 : static inline void vma_start_write(struct vm_area_struct *vma) {}
751 : static inline bool vma_try_start_write(struct vm_area_struct *vma)
752 : { return true; }
753 : static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
754 : static inline void vma_mark_detached(struct vm_area_struct *vma,
755 : bool detached) {}
756 :
757 : #endif /* CONFIG_PER_VMA_LOCK */
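/*
 * Illustrative sketch, not part of mm.h: how per-VMA locking pairs up when
 * CONFIG_PER_VMA_LOCK is enabled. The VMA is looked up and read-locked
 * without mmap_lock, inspected, then released with vma_end_read().
 * example_addr_is_writable is a made-up name.
 */
#if 0	/* example only */
static bool example_addr_is_writable(struct mm_struct *mm,
				     unsigned long address)
{
	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
	bool writable;

	if (!vma)
		return false;	/* caller would fall back to mmap_lock */

	writable = !!(vma->vm_flags & VM_WRITE);
	vma_end_read(vma);
	return writable;
}
#endif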
758 :
759 : /*
760 : * WARNING: vma_init does not initialize vma->vm_lock.
761 : * Use vm_area_alloc()/vm_area_free() if vma needs locking.
762 : */
763 : static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
764 : {
765 : static const struct vm_operations_struct dummy_vm_ops = {};
766 :
767 : memset(vma, 0, sizeof(*vma));
768 : vma->vm_mm = mm;
769 : vma->vm_ops = &dummy_vm_ops;
770 : INIT_LIST_HEAD(&vma->anon_vma_chain);
771 : vma_mark_detached(vma, false);
772 : vma_numab_state_init(vma);
773 : }
774 :
775 : /* Use when VMA is not part of the VMA tree and needs no locking */
776 : static inline void vm_flags_init(struct vm_area_struct *vma,
777 : vm_flags_t flags)
778 : {
779 15919784 : ACCESS_PRIVATE(vma, __vm_flags) = flags;
780 : }
781 :
782 : /* Use when VMA is part of the VMA tree and modifications need coordination */
783 : static inline void vm_flags_reset(struct vm_area_struct *vma,
784 : vm_flags_t flags)
785 : {
786 : vma_start_write(vma);
787 : vm_flags_init(vma, flags);
788 : }
789 :
790 : static inline void vm_flags_reset_once(struct vm_area_struct *vma,
791 : vm_flags_t flags)
792 : {
793 : vma_start_write(vma);
794 : WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
795 : }
796 :
797 : static inline void vm_flags_set(struct vm_area_struct *vma,
798 : vm_flags_t flags)
799 : {
800 479145 : vma_start_write(vma);
801 479144 : ACCESS_PRIVATE(vma, __vm_flags) |= flags;
802 : }
803 :
804 : static inline void vm_flags_clear(struct vm_area_struct *vma,
805 : vm_flags_t flags)
806 : {
807 15479564 : vma_start_write(vma);
808 15475096 : ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
809 : }
810 :
811 : /*
812 : * Use only if VMA is not part of the VMA tree or has no other users and
813 : * therefore needs no locking.
814 : */
815 : static inline void __vm_flags_mod(struct vm_area_struct *vma,
816 : vm_flags_t set, vm_flags_t clear)
817 : {
818 : vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
819 : }
820 :
821 : /*
822 : * Use only when the order of set/clear operations is unimportant, otherwise
823 : * use vm_flags_{set|clear} explicitly.
824 : */
825 : static inline void vm_flags_mod(struct vm_area_struct *vma,
826 : vm_flags_t set, vm_flags_t clear)
827 : {
828 : vma_start_write(vma);
829 : __vm_flags_mod(vma, set, clear);
830 : }
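/*
 * Usage notes (illustrative, not part of mm.h): code operating on a VMA that
 * is already in the VMA tree uses the locked helpers, e.g. mlock-style code
 * doing vm_flags_set(vma, VM_LOCKED) or vm_flags_clear(vma, VM_LOCKED_MASK),
 * both of which write-lock the VMA via vma_start_write(). Code still filling
 * in a detached VMA uses vm_flags_init()/__vm_flags_mod(), which skip the
 * locking entirely.
 */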
831 :
832 : static inline void vma_set_anonymous(struct vm_area_struct *vma)
833 : {
834 15919894 : vma->vm_ops = NULL;
835 : }
836 :
837 : static inline bool vma_is_anonymous(struct vm_area_struct *vma)
838 : {
839 : return !vma->vm_ops;
840 : }
841 :
842 : static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
843 : {
844 : int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
845 :
846 : if (!maybe_stack)
847 : return false;
848 :
849 : if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
850 : VM_STACK_INCOMPLETE_SETUP)
851 : return true;
852 :
853 : return false;
854 : }
855 :
856 : static inline bool vma_is_foreign(struct vm_area_struct *vma)
857 : {
858 : if (!current->mm)
859 : return true;
860 :
861 : if (current->mm != vma->vm_mm)
862 : return true;
863 :
864 : return false;
865 : }
866 :
867 : static inline bool vma_is_accessible(struct vm_area_struct *vma)
868 : {
869 : return vma->vm_flags & VM_ACCESS_FLAGS;
870 : }
871 :
872 : static inline
873 : struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
874 : {
875 : return mas_find(&vmi->mas, max - 1);
876 : }
877 :
878 : static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
879 : {
880 : /*
881 : * Uses mas_find() to get the first VMA when the iterator starts.
882 : * Calling mas_next() could skip the first entry.
883 : */
884 30947383 : return mas_find(&vmi->mas, ULONG_MAX);
885 : }
886 :
887 : static inline
888 : struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
889 : {
890 : return mas_next_range(&vmi->mas, ULONG_MAX);
891 : }
892 :
893 :
894 : static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
895 : {
896 15474286 : return mas_prev(&vmi->mas, 0);
897 : }
898 :
899 : static inline
900 : struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
901 : {
902 : return mas_prev_range(&vmi->mas, 0);
903 : }
904 :
905 : static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
906 : {
907 : return vmi->mas.index;
908 : }
909 :
910 : static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
911 : {
912 : return vmi->mas.last + 1;
913 : }
914 : static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
915 : unsigned long count)
916 : {
917 : return mas_expected_entries(&vmi->mas, count);
918 : }
919 :
920 : /* Free any unused preallocations */
921 : static inline void vma_iter_free(struct vma_iterator *vmi)
922 : {
923 : mas_destroy(&vmi->mas);
924 : }
925 :
926 : static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
927 : struct vm_area_struct *vma)
928 : {
929 : vmi->mas.index = vma->vm_start;
930 : vmi->mas.last = vma->vm_end - 1;
931 : mas_store(&vmi->mas, vma);
932 : if (unlikely(mas_is_err(&vmi->mas)))
933 : return -ENOMEM;
934 :
935 : return 0;
936 : }
937 :
938 : static inline void vma_iter_invalidate(struct vma_iterator *vmi)
939 : {
940 : mas_pause(&vmi->mas);
941 : }
942 :
943 : static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
944 : {
945 : mas_set(&vmi->mas, addr);
946 : }
947 :
948 : #define for_each_vma(__vmi, __vma) \
949 : while (((__vma) = vma_next(&(__vmi))) != NULL)
950 :
951 : /* The MM code likes to work with exclusive end addresses */
952 : #define for_each_vma_range(__vmi, __vma, __end) \
953 : while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
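/*
 * Illustrative sketch, not part of mm.h: walking every VMA of an mm with the
 * iterator helpers above. VMA_ITERATOR comes from <linux/mm_types.h>; the
 * caller must hold mmap_lock at least for reading. example_dump_vmas is a
 * made-up name.
 */
#if 0	/* example only */
static void example_dump_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		pr_info("vma %08lx-%08lx flags %08lx\n",
			vma->vm_start, vma->vm_end, vma->vm_flags);
	mmap_read_unlock(mm);
}
#endif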
954 :
955 : #ifdef CONFIG_SHMEM
956 : /*
957 : * vma_is_shmem() is not inline because it is used only by slow
958 : * paths in userfault.
959 : */
960 : bool vma_is_shmem(struct vm_area_struct *vma);
961 : bool vma_is_anon_shmem(struct vm_area_struct *vma);
962 : #else
963 : static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
964 : static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
965 : #endif
966 :
967 : int vma_is_stack_for_current(struct vm_area_struct *vma);
968 :
969 : /* flush_tlb_range() takes a vma, not a mm, and can care about flags */
970 : #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
971 :
972 : struct mmu_gather;
973 : struct inode;
974 :
975 : /*
976 : * compound_order() can be called without holding a reference, which means
977 : * that niceties like page_folio() don't work. These callers should be
978 : * prepared to handle wild return values. For example, PG_head may be
979 : * set before _folio_order is initialised, or this may be a tail page.
980 : * See compaction.c for some good examples.
981 : */
982 846451156 : static inline unsigned int compound_order(struct page *page)
983 : {
984 846451156 : struct folio *folio = (struct folio *)page;
985 :
986 846451156 : if (!test_bit(PG_head, &folio->flags))
987 : return 0;
988 32055507 : return folio->_folio_order;
989 : }
990 :
991 : /**
992 : * folio_order - The allocation order of a folio.
993 : * @folio: The folio.
994 : *
995 : * A folio is composed of 2^order pages. See get_order() for the definition
996 : * of order.
997 : *
998 : * Return: The order of the folio.
999 : */
1000 : static inline unsigned int folio_order(struct folio *folio)
1001 : {
1002 3131890662 : if (!folio_test_large(folio))
1003 : return 0;
1004 329688116 : return folio->_folio_order;
1005 : }
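/*
 * Example: folio_order() == 2 means the folio is built from 1 << 2 == 4
 * contiguous pages, i.e. 16KiB with 4KiB pages; order 0 is a single page.
 */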
1006 :
1007 : #include <linux/huge_mm.h>
1008 :
1009 : /*
1010 : * Methods to modify the page usage count.
1011 : *
1012 : * What counts for a page usage:
1013 : * - cache mapping (page->mapping)
1014 : * - private data (page->private)
1015 : * - page mapped in a task's page tables, each mapping
1016 : * is counted separately
1017 : *
1018 : * Also, many kernel routines increase the page count before a critical
1019 : * routine so they can be sure the page doesn't go away from under them.
1020 : */
1021 :
1022 : /*
1023 : * Drop a ref, return true if the refcount fell to zero (the page has no users)
1024 : */
1025 : static inline int put_page_testzero(struct page *page)
1026 : {
1027 28113875351 : VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
1028 28113875351 : return page_ref_dec_and_test(page);
1029 : }
1030 :
1031 : static inline int folio_put_testzero(struct folio *folio)
1032 : {
1033 28113875351 : return put_page_testzero(&folio->page);
1034 : }
1035 :
1036 : /*
1037 : * Try to grab a ref unless the page has a refcount of zero, return false if
1038 : * that is the case.
1039 : * This can be called when MMU is off so it must not access
1040 : * any of the virtual mappings.
1041 : */
1042 : static inline bool get_page_unless_zero(struct page *page)
1043 : {
1044 : return page_ref_add_unless(page, 1, 0);
1045 : }
1046 :
1047 : static inline struct folio *folio_get_nontail_page(struct page *page)
1048 : {
1049 : if (unlikely(!get_page_unless_zero(page)))
1050 : return NULL;
1051 : return (struct folio *)page;
1052 : }
1053 :
1054 : extern int page_is_ram(unsigned long pfn);
1055 :
1056 : enum {
1057 : REGION_INTERSECTS,
1058 : REGION_DISJOINT,
1059 : REGION_MIXED,
1060 : };
1061 :
1062 : int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
1063 : unsigned long desc);
1064 :
1065 : /* Support for virtually mapped pages */
1066 : struct page *vmalloc_to_page(const void *addr);
1067 : unsigned long vmalloc_to_pfn(const void *addr);
1068 :
1069 : /*
1070 : * Determine if an address is within the vmalloc range
1071 : *
1072 : * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
1073 : * is no special casing required.
1074 : */
1075 :
1076 : #ifndef is_ioremap_addr
1077 : #define is_ioremap_addr(x) is_vmalloc_addr(x)
1078 : #endif
1079 :
1080 : #ifdef CONFIG_MMU
1081 : extern bool is_vmalloc_addr(const void *x);
1082 : extern int is_vmalloc_or_module_addr(const void *x);
1083 : #else
1084 : static inline bool is_vmalloc_addr(const void *x)
1085 : {
1086 : return false;
1087 : }
1088 : static inline int is_vmalloc_or_module_addr(const void *x)
1089 : {
1090 : return 0;
1091 : }
1092 : #endif
1093 :
1094 : /*
1095 : * How many times the entire folio is mapped as a single unit (eg by a
1096 : * PMD or PUD entry). This is probably not what you want, except for
1097 : * debugging purposes - it does not include PTE-mapped sub-pages; look
1098 : * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
1099 : */
1100 : static inline int folio_entire_mapcount(struct folio *folio)
1101 : {
1102 0 : VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1103 0 : return atomic_read(&folio->_entire_mapcount) + 1;
1104 : }
1105 :
1106 : /*
1107 : * The atomic page->_mapcount starts from -1, so that transitions
1108 : * both from it and to it can be tracked, using atomic_inc_and_test
1109 : * and atomic_add_negative(-1).
1110 : */
1111 : static inline void page_mapcount_reset(struct page *page)
1112 : {
1113 0 : atomic_set(&(page)->_mapcount, -1);
1114 : }
1115 :
1116 : /**
1117 : * page_mapcount() - Number of times this precise page is mapped.
1118 : * @page: The page.
1119 : *
1120 : * The number of times this page is mapped. If this page is part of
1121 : * a large folio, it includes the number of times this page is mapped
1122 : * as part of that folio.
1123 : *
1124 : * The result is undefined for pages which cannot be mapped into userspace,
1125 : * for example SLAB or other special types of pages; see page_has_type().
1126 : * Such pages use this field in struct page differently.
1127 : */
1128 0 : static inline int page_mapcount(struct page *page)
1129 : {
1130 0 : int mapcount = atomic_read(&page->_mapcount) + 1;
1131 :
1132 0 : if (unlikely(PageCompound(page)))
1133 0 : mapcount += folio_entire_mapcount(page_folio(page));
1134 :
1135 0 : return mapcount;
1136 : }
1137 :
1138 : int folio_total_mapcount(struct folio *folio);
1139 :
1140 : /**
1141 : * folio_mapcount() - Calculate the number of mappings of this folio.
1142 : * @folio: The folio.
1143 : *
1144 : * A large folio tracks both how many times the entire folio is mapped,
1145 : * and how many times each individual page in the folio is mapped.
1146 : * This function calculates the total number of times the folio is
1147 : * mapped.
1148 : *
1149 : * Return: The number of times this folio is mapped.
1150 : */
1151 : static inline int folio_mapcount(struct folio *folio)
1152 : {
1153 : if (likely(!folio_test_large(folio)))
1154 : return atomic_read(&folio->_mapcount) + 1;
1155 : return folio_total_mapcount(folio);
1156 : }
1157 :
1158 : static inline int total_mapcount(struct page *page)
1159 : {
1160 : if (likely(!PageCompound(page)))
1161 : return atomic_read(&page->_mapcount) + 1;
1162 : return folio_total_mapcount(page_folio(page));
1163 : }
1164 :
1165 : static inline bool folio_large_is_mapped(struct folio *folio)
1166 : {
1167 : /*
1168 : * Reading _entire_mapcount below could be omitted if hugetlb
1169 : * participated in incrementing nr_pages_mapped when compound mapped.
1170 : */
1171 12980673 : return atomic_read(&folio->_nr_pages_mapped) > 0 ||
1172 : atomic_read(&folio->_entire_mapcount) >= 0;
1173 : }
1174 :
1175 : /**
1176 : * folio_mapped - Is this folio mapped into userspace?
1177 : * @folio: The folio.
1178 : *
1179 : * Return: True if any page in this folio is referenced by user page tables.
1180 : */
1181 805566349 : static inline bool folio_mapped(struct folio *folio)
1182 : {
1183 805566349 : if (likely(!folio_test_large(folio)))
1184 792585421 : return atomic_read(&folio->_mapcount) >= 0;
1185 25961601 : return folio_large_is_mapped(folio);
1186 : }
1187 :
1188 : /*
1189 : * Return true if this page is mapped into pagetables.
1190 : * For compound page it returns true if any sub-page of compound page is mapped,
1191 : * even if this particular sub-page is not itself mapped by any PTE or PMD.
1192 : */
1193 : static inline bool page_mapped(struct page *page)
1194 : {
1195 : if (likely(!PageCompound(page)))
1196 : return atomic_read(&page->_mapcount) >= 0;
1197 : return folio_large_is_mapped(page_folio(page));
1198 : }
1199 :
1200 : static inline struct page *virt_to_head_page(const void *x)
1201 : {
1202 : struct page *page = virt_to_page(x);
1203 :
1204 : return compound_head(page);
1205 : }
1206 :
1207 : static inline struct folio *virt_to_folio(const void *x)
1208 : {
1209 0 : struct page *page = virt_to_page(x);
1210 :
1211 0 : return page_folio(page);
1212 : }
1213 :
1214 : void __folio_put(struct folio *folio);
1215 :
1216 : void put_pages_list(struct list_head *pages);
1217 :
1218 : void split_page(struct page *page, unsigned int order);
1219 : void folio_copy(struct folio *dst, struct folio *src);
1220 :
1221 : unsigned long nr_free_buffer_pages(void);
1222 :
1223 : /*
1224 : * Compound pages have a destructor function. Provide a
1225 : * prototype for that function and accessor functions.
1226 : * These are _only_ valid on the head of a compound page.
1227 : */
1228 : typedef void compound_page_dtor(struct page *);
1229 :
1230 : /* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
1231 : enum compound_dtor_id {
1232 : NULL_COMPOUND_DTOR,
1233 : COMPOUND_PAGE_DTOR,
1234 : #ifdef CONFIG_HUGETLB_PAGE
1235 : HUGETLB_PAGE_DTOR,
1236 : #endif
1237 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1238 : TRANSHUGE_PAGE_DTOR,
1239 : #endif
1240 : NR_COMPOUND_DTORS,
1241 : };
1242 :
1243 : static inline void folio_set_compound_dtor(struct folio *folio,
1244 : enum compound_dtor_id compound_dtor)
1245 : {
1246 : VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
1247 : folio->_folio_dtor = compound_dtor;
1248 : }
1249 :
1250 : void destroy_large_folio(struct folio *folio);
1251 :
1252 : /* Returns the number of bytes in this potentially compound page. */
1253 : static inline unsigned long page_size(struct page *page)
1254 : {
1255 423234585 : return PAGE_SIZE << compound_order(page);
1256 : }
1257 :
1258 : /* Returns the number of bits needed for the number of bytes in a page */
1259 : static inline unsigned int page_shift(struct page *page)
1260 : {
1261 : return PAGE_SHIFT + compound_order(page);
1262 : }
1263 :
1264 : /**
1265 : * thp_order - Order of a transparent huge page.
1266 : * @page: Head page of a transparent huge page.
1267 : */
1268 : static inline unsigned int thp_order(struct page *page)
1269 : {
1270 : VM_BUG_ON_PGFLAGS(PageTail(page), page);
1271 : return compound_order(page);
1272 : }
1273 :
1274 : /**
1275 : * thp_size - Size of a transparent huge page.
1276 : * @page: Head page of a transparent huge page.
1277 : *
1278 : * Return: Number of bytes in this page.
1279 : */
1280 : static inline unsigned long thp_size(struct page *page)
1281 : {
1282 : return PAGE_SIZE << thp_order(page);
1283 : }
1284 :
1285 : void free_compound_page(struct page *page);
1286 :
1287 : #ifdef CONFIG_MMU
1288 : /*
1289 : * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1290 : * servicing faults for write access. In the normal case, we always want
1291 : * pte_mkwrite. But get_user_pages can cause write faults for mappings
1292 : * that do not have writing enabled, when used by access_process_vm.
1293 : */
1294 : static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1295 : {
1296 : if (likely(vma->vm_flags & VM_WRITE))
1297 : pte = pte_mkwrite(pte);
1298 : return pte;
1299 : }
1300 :
1301 : vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
1302 : void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
1303 :
1304 : vm_fault_t finish_fault(struct vm_fault *vmf);
1305 : vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
1306 : #endif
1307 :
1308 : /*
1309 : * Multiple processes may "see" the same page. E.g. for untouched
1310 : * mappings of /dev/null, all processes see the same page full of
1311 : * zeroes, and text pages of executables and shared libraries have
1312 : * only one copy in memory, at most, normally.
1313 : *
1314 : * For the non-reserved pages, page_count(page) denotes a reference count.
1315 : * page_count() == 0 means the page is free. page->lru is then used for
1316 : * freelist management in the buddy allocator.
1317 : * page_count() > 0 means the page has been allocated.
1318 : *
1319 : * Pages are allocated by the slab allocator in order to provide memory
1320 : * to kmalloc and kmem_cache_alloc. In this case, the management of the
1321 : * page, and the fields in 'struct page' are the responsibility of mm/slab.c
1322 : * unless a particular usage is carefully commented. (the responsibility of
1323 : * freeing the kmalloc memory is the caller's, of course).
1324 : *
1325 : * A page may be used by anyone else who does a __get_free_page().
1326 : * In this case, page_count still tracks the references, and should only
1327 : * be used through the normal accessor functions. The top bits of page->flags
1328 : * and page->virtual store page management information, but all other fields
1329 : * are unused and could be used privately, carefully. The management of this
1330 : * page is the responsibility of the one who allocated it, and those who have
1331 : * subsequently been given references to it.
1332 : *
1333 : * The other pages (we may call them "pagecache pages") are completely
1334 : * managed by the Linux memory manager: I/O, buffers, swapping etc.
1335 : * The following discussion applies only to them.
1336 : *
1337 : * A pagecache page contains an opaque `private' member, which belongs to the
1338 : * page's address_space. Usually, this is the address of a circular list of
1339 : * the page's disk buffers. PG_private must be set to tell the VM to call
1340 : * into the filesystem to release these pages.
1341 : *
1342 : * A page may belong to an inode's memory mapping. In this case, page->mapping
1343 : * is the pointer to the inode, and page->index is the file offset of the page,
1344 : * in units of PAGE_SIZE.
1345 : *
1346 : * If pagecache pages are not associated with an inode, they are said to be
1347 : * anonymous pages. These may become associated with the swapcache, and in that
1348 : * case PG_swapcache is set, and page->private is an offset into the swapcache.
1349 : *
1350 : * In either case (swapcache or inode backed), the pagecache itself holds one
1351 : * reference to the page. Setting PG_private should also increment the
1352 : * refcount. Each user mapping also has a reference to the page.
1353 : *
1354 : * The pagecache pages are stored in a per-mapping radix tree, which is
1355 : * rooted at mapping->i_pages, and indexed by offset.
1356 : * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
1357 : * lists, we instead now tag pages as dirty/writeback in the radix tree.
1358 : *
1359 : * All pagecache pages may be subject to I/O:
1360 : * - inode pages may need to be read from disk,
1361 : * - inode pages which have been modified and are MAP_SHARED may need
1362 : * to be written back to the inode on disk,
1363 : * - anonymous pages (including MAP_PRIVATE file mappings) which have been
1364 : * modified may need to be swapped out to swap space and (later) to be read
1365 : * back into memory.
1366 : */
1367 :
1368 : #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
1369 : DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
1370 :
1371 : bool __put_devmap_managed_page_refs(struct page *page, int refs);
1372 : static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
1373 : {
1374 : if (!static_branch_unlikely(&devmap_managed_key))
1375 : return false;
1376 : if (!is_zone_device_page(page))
1377 : return false;
1378 : return __put_devmap_managed_page_refs(page, refs);
1379 : }
1380 : #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
1381 : static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
1382 : {
1383 : return false;
1384 : }
1385 : #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
1386 :
1387 : static inline bool put_devmap_managed_page(struct page *page)
1388 : {
1389 : return put_devmap_managed_page_refs(page, 1);
1390 : }
1391 :
1392 : /* 127: arbitrary random number, small enough to assemble well */
1393 : #define folio_ref_zero_or_close_to_overflow(folio) \
1394 : ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
1395 :
1396 : /**
1397 : * folio_get - Increment the reference count on a folio.
1398 : * @folio: The folio.
1399 : *
1400 : * Context: May be called in any context, as long as you know that
1401 : * you have a refcount on the folio. If you do not already have one,
1402 : * folio_try_get() may be the right interface for you to use.
1403 : */
1404 : static inline void folio_get(struct folio *folio)
1405 : {
1406 26696060787 : VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
1407 26696060787 : folio_ref_inc(folio);
1408 : }
1409 :
1410 26249778579 : static inline void get_page(struct page *page)
1411 : {
1412 52499557158 : folio_get(page_folio(page));
1413 26250054647 : }
1414 :
1415 0 : static inline __must_check bool try_get_page(struct page *page)
1416 : {
1417 0 : page = compound_head(page);
1418 0 : if (WARN_ON_ONCE(page_ref_count(page) <= 0))
1419 : return false;
1420 0 : page_ref_inc(page);
1421 0 : return true;
1422 : }
1423 :
1424 : /**
1425 : * folio_put - Decrement the reference count on a folio.
1426 : * @folio: The folio.
1427 : *
1428 : * If the folio's reference count reaches zero, the memory will be
1429 : * released back to the page allocator and may be used by another
1430 : * allocation immediately. Do not access the memory or the struct folio
1431 : * after calling folio_put() unless you can be sure that it wasn't the
1432 : * last reference.
1433 : *
1434 : * Context: May be called in process or interrupt context, but not in NMI
1435 : * context. May be called while holding a spinlock.
1436 : */
1437 28113875351 : static inline void folio_put(struct folio *folio)
1438 : {
1439 28113875351 : if (folio_put_testzero(folio))
1440 2391513 : __folio_put(folio);
1441 28117299967 : }
1442 :
1443 : /**
1444 : * folio_put_refs - Reduce the reference count on a folio.
1445 : * @folio: The folio.
1446 : * @refs: The amount to subtract from the folio's reference count.
1447 : *
1448 : * If the folio's reference count reaches zero, the memory will be
1449 : * released back to the page allocator and may be used by another
1450 : * allocation immediately. Do not access the memory or the struct folio
1451 : * after calling folio_put_refs() unless you can be sure that these weren't
1452 : * the last references.
1453 : *
1454 : * Context: May be called in process or interrupt context, but not in NMI
1455 : * context. May be called while holding a spinlock.
1456 : */
1457 362153623 : static inline void folio_put_refs(struct folio *folio, int refs)
1458 : {
1459 362153623 : if (folio_ref_sub_and_test(folio, refs))
1460 0 : __folio_put(folio);
1461 362158107 : }
1462 :
1463 : /*
1464 : * union release_pages_arg - an array of pages or folios
1465 : *
1466 : * release_pages() releases a simple array of multiple pages, and
1467 : * accepts various different forms of said page array: either
1468 : * a regular old boring array of pages, an array of folios, or
1469 : * an array of encoded page pointers.
1470 : *
1471 : * The transparent union syntax for this kind of "any of these
1472 : * argument types" is all kinds of ugly, so look away.
1473 : */
1474 : typedef union {
1475 : struct page **pages;
1476 : struct folio **folios;
1477 : struct encoded_page **encoded_pages;
1478 : } release_pages_arg __attribute__ ((__transparent_union__));
1479 :
1480 : void release_pages(release_pages_arg, int nr);
1481 :
1482 : /**
1483 : * folios_put - Decrement the reference count on an array of folios.
1484 : * @folios: The folios.
1485 : * @nr: How many folios there are.
1486 : *
1487 : * Like folio_put(), but for an array of folios. This is more efficient
1488 : * than writing the loop yourself as it will optimise the locks which
1489 : * need to be taken if the folios are freed.
1490 : *
1491 : * Context: May be called in process or interrupt context, but not in NMI
1492 : * context. May be called while holding a spinlock.
1493 : */
1494 : static inline void folios_put(struct folio **folios, unsigned int nr)
1495 : {
1496 : release_pages(folios, nr);
1497 : }
1498 :
1499 26451001014 : static inline void put_page(struct page *page)
1500 : {
1501 26451001014 : struct folio *folio = page_folio(page);
1502 :
1503 : /*
1504 : * For some devmap managed pages we need to catch refcount transition
1505 : * from 2 to 1:
1506 : */
1507 26451001014 : if (put_devmap_managed_page(&folio->page))
1508 : return;
1509 26451001014 : folio_put(folio);
1510 : }
1511 :
1512 : /*
1513 : * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
1514 : * the page's refcount so that two separate items are tracked: the original page
1515 : * reference count, and also a new count of how many pin_user_pages() calls were
1516 : * made against the page. ("gup-pinned" is another term for the latter).
1517 : *
1518 : * With this scheme, pin_user_pages() becomes special: such pages are marked as
1519 : * distinct from normal pages. As such, the unpin_user_page() call (and its
1520 : * variants) must be used in order to release gup-pinned pages.
1521 : *
1522 : * Choice of value:
1523 : *
1524 : * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
1525 : * counts with respect to pin_user_pages() and unpin_user_page() becomes
1526 : * simpler, due to the fact that adding an even power of two to the page
1527 : * refcount has the effect of using only the upper N bits, for the code that
1528 : * counts up using the bias value. This means that the lower bits are left for
1529 : * the exclusive use of the original code that increments and decrements by one
1530 : * (or at least, by much smaller values than the bias value).
1531 : *
1532 : * Of course, once the lower bits overflow into the upper bits (and this is
1533 : * OK, because subtraction recovers the original values), then visual inspection
1534 : * no longer suffices to directly view the separate counts. However, for normal
1535 : * applications that don't have huge page reference counts, this won't be an
1536 : * issue.
1537 : *
1538 : * Locking: the lockless algorithm described in folio_try_get_rcu()
1539 : * provides safe operation for get_user_pages(), page_mkclean() and
1540 : * other calls that race to set up page table entries.
1541 : */
1542 : #define GUP_PIN_COUNTING_BIAS (1U << 10)
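
/*
 * Worked example (illustrative): a small folio with two ordinary references
 * that has also been pinned twice via pin_user_pages() reads
 * folio_ref_count() == 2 + 2 * GUP_PIN_COUNTING_BIAS == 2050 (0x802).
 * The upper bits (0x800) carry the two pins and the lower bits (0x002) the
 * ordinary references, which is what makes the folio_maybe_dma_pinned()
 * heuristic below possible for small folios.
 */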
1543 :
1544 : void unpin_user_page(struct page *page);
1545 : void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
1546 : bool make_dirty);
1547 : void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
1548 : bool make_dirty);
1549 : void unpin_user_pages(struct page **pages, unsigned long npages);
1550 :
1551 : static inline bool is_cow_mapping(vm_flags_t flags)
1552 : {
1553 : return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
1554 : }
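
/*
 * Illustrative example: a MAP_PRIVATE mapping (anonymous or file-backed)
 * normally has VM_MAYWRITE set but VM_SHARED clear, so is_cow_mapping()
 * returns true; a MAP_SHARED mapping has VM_SHARED set and always returns
 * false.
 */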
1555 :
1556 : #ifndef CONFIG_MMU
1557 : static inline bool is_nommu_shared_mapping(vm_flags_t flags)
1558 : {
1559 : /*
1560 : * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
1561 : * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
1562 : * a file mapping. R/O MAP_PRIVATE mappings might still modify
1563 : * underlying memory if ptrace is active, so this is only possible if
1564 : * ptrace does not apply. Note that there is no mprotect() to upgrade
1565 : * write permissions later.
1566 : */
1567 : return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
1568 : }
1569 : #endif
1570 :
1571 : #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
1572 : #define SECTION_IN_PAGE_FLAGS
1573 : #endif
1574 :
1575 : /*
1576 : * The identification function is mainly used by the buddy allocator for
1577 : * determining if two pages could be buddies. We are not really identifying
1578 : * the zone, since we could be using the section number id if we do not have
1579 : * the node id available in page flags.
1580 : * We only guarantee that it will return the same value for any two combinable
1581 : * pages in a zone.
1582 : */
1583 : static inline int page_zone_id(struct page *page)
1584 : {
1585 : return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
1586 : }
1587 :
1588 : #ifdef NODE_NOT_IN_PAGE_FLAGS
1589 : extern int page_to_nid(const struct page *page);
1590 : #else
1591 : static inline int page_to_nid(const struct page *page)
1592 : {
1593 411306599 : struct page *p = (struct page *)page;
1594 :
1595 411306599 : return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
1596 : }
1597 : #endif
1598 :
1599 : static inline int folio_nid(const struct folio *folio)
1600 : {
1601 : return page_to_nid(&folio->page);
1602 : }
1603 :
1604 : #ifdef CONFIG_NUMA_BALANCING
1605 : /* page access time bits need to hold at least 4 seconds */
1606 : #define PAGE_ACCESS_TIME_MIN_BITS 12
1607 : #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
1608 : #define PAGE_ACCESS_TIME_BUCKETS \
1609 : (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
1610 : #else
1611 : #define PAGE_ACCESS_TIME_BUCKETS 0
1612 : #endif
1613 :
1614 : #define PAGE_ACCESS_TIME_MASK \
1615 : (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)
1616 :
1617 : static inline int cpu_pid_to_cpupid(int cpu, int pid)
1618 : {
1619 : return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
1620 : }
1621 :
1622 : static inline int cpupid_to_pid(int cpupid)
1623 : {
1624 : return cpupid & LAST__PID_MASK;
1625 : }
1626 :
1627 : static inline int cpupid_to_cpu(int cpupid)
1628 : {
1629 : return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
1630 : }
1631 :
1632 : static inline int cpupid_to_nid(int cpupid)
1633 : {
1634 : return cpu_to_node(cpupid_to_cpu(cpupid));
1635 : }
1636 :
1637 : static inline bool cpupid_pid_unset(int cpupid)
1638 : {
1639 : return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
1640 : }
1641 :
1642 : static inline bool cpupid_cpu_unset(int cpupid)
1643 : {
1644 : return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
1645 : }
1646 :
1647 : static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
1648 : {
1649 : return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
1650 : }
1651 :
1652 : #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
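
/*
 * Illustrative round-trip with the helpers above (assumes
 * CONFIG_NUMA_BALANCING):
 *
 *	int cpupid = cpu_pid_to_cpupid(raw_smp_processor_id(), current->pid);
 *
 *	cpupid_to_cpu(cpupid)	recovers the (masked) cpu number
 *	cpupid_to_pid(cpupid)	recovers the low bits of the pid
 *	cpupid_to_nid(cpupid)	maps that cpu back to its NUMA node
 */
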
1653 : #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
1654 : static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
1655 : {
1656 : return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
1657 : }
1658 :
1659 : static inline int page_cpupid_last(struct page *page)
1660 : {
1661 : return page->_last_cpupid;
1662 : }
1663 : static inline void page_cpupid_reset_last(struct page *page)
1664 : {
1665 : page->_last_cpupid = -1 & LAST_CPUPID_MASK;
1666 : }
1667 : #else
1668 : static inline int page_cpupid_last(struct page *page)
1669 : {
1670 : return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
1671 : }
1672 :
1673 : extern int page_cpupid_xchg_last(struct page *page, int cpupid);
1674 :
1675 : static inline void page_cpupid_reset_last(struct page *page)
1676 : {
1677 : page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
1678 : }
1679 : #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
1680 :
1681 : static inline int xchg_page_access_time(struct page *page, int time)
1682 : {
1683 : int last_time;
1684 :
1685 : last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
1686 : return last_time << PAGE_ACCESS_TIME_BUCKETS;
1687 : }
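
/*
 * Worked example (illustrative): with PAGE_ACCESS_TIME_BUCKETS == 2, an
 * access time of 1000 is stored as 1000 >> 2 == 250 and read back as
 * 250 << 2 == 1000, i.e. the time is kept at a granularity of
 * 1 << PAGE_ACCESS_TIME_BUCKETS units so that it fits in the cpupid bits.
 */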
1688 :
1689 : static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
1690 : {
1691 : unsigned int pid_bit;
1692 :
1693 : pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
1694 : if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
1695 : __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
1696 : }
1697 : }
1698 : #else /* !CONFIG_NUMA_BALANCING */
1699 : static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
1700 : {
1701 : return page_to_nid(page); /* XXX */
1702 : }
1703 :
1704 : static inline int xchg_page_access_time(struct page *page, int time)
1705 : {
1706 : return 0;
1707 : }
1708 :
1709 : static inline int page_cpupid_last(struct page *page)
1710 : {
1711 : return page_to_nid(page); /* XXX */
1712 : }
1713 :
1714 : static inline int cpupid_to_nid(int cpupid)
1715 : {
1716 : return -1;
1717 : }
1718 :
1719 : static inline int cpupid_to_pid(int cpupid)
1720 : {
1721 : return -1;
1722 : }
1723 :
1724 : static inline int cpupid_to_cpu(int cpupid)
1725 : {
1726 : return -1;
1727 : }
1728 :
1729 : static inline int cpu_pid_to_cpupid(int nid, int pid)
1730 : {
1731 : return -1;
1732 : }
1733 :
1734 : static inline bool cpupid_pid_unset(int cpupid)
1735 : {
1736 : return true;
1737 : }
1738 :
1739 : static inline void page_cpupid_reset_last(struct page *page)
1740 : {
1741 : }
1742 :
1743 : static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
1744 : {
1745 : return false;
1746 : }
1747 :
1748 : static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
1749 : {
1750 : }
1751 : #endif /* CONFIG_NUMA_BALANCING */
1752 :
1753 : #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
1754 :
1755 : /*
1756 : * KASAN per-page tags are stored xor'ed with 0xff. This avoids having to
1757 : * set tags for all pages to the native kernel tag value 0xff, as the
1758 : * default value 0x00 then maps to 0xff.
1759 : */
1760 :
1761 : static inline u8 page_kasan_tag(const struct page *page)
1762 : {
1763 : u8 tag = 0xff;
1764 :
1765 : if (kasan_enabled()) {
1766 : tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
1767 : tag ^= 0xff;
1768 : }
1769 :
1770 : return tag;
1771 : }
1772 :
1773 : static inline void page_kasan_tag_set(struct page *page, u8 tag)
1774 : {
1775 : unsigned long old_flags, flags;
1776 :
1777 : if (!kasan_enabled())
1778 : return;
1779 :
1780 : tag ^= 0xff;
1781 : old_flags = READ_ONCE(page->flags);
1782 : do {
1783 : flags = old_flags;
1784 : flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
1785 : flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
1786 : } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
1787 : }
1788 :
1789 : static inline void page_kasan_tag_reset(struct page *page)
1790 : {
1791 : if (kasan_enabled())
1792 : page_kasan_tag_set(page, 0xff);
1793 : }
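
/*
 * Illustrative round-trip: because tags are stored xor'ed with 0xff,
 * page_kasan_tag_set(page, 0xab) stores 0xab ^ 0xff == 0x54 in page->flags
 * and page_kasan_tag(page) xors it back and returns 0xab. Freshly zeroed
 * flags therefore read back as the native kernel tag 0xff without any
 * explicit initialisation.
 */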
1794 :
1795 : #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
1796 :
1797 : static inline u8 page_kasan_tag(const struct page *page)
1798 : {
1799 : return 0xff;
1800 : }
1801 :
1802 : static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
1803 : static inline void page_kasan_tag_reset(struct page *page) { }
1804 :
1805 : #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
1806 :
1807 : static inline struct zone *page_zone(const struct page *page)
1808 : {
1809 235988013 : return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
1810 : }
1811 :
1812 : static inline pg_data_t *page_pgdat(const struct page *page)
1813 : {
1814 175318586 : return NODE_DATA(page_to_nid(page));
1815 : }
1816 :
1817 : static inline struct zone *folio_zone(const struct folio *folio)
1818 : {
1819 235988013 : return page_zone(&folio->page);
1820 : }
1821 :
1822 : static inline pg_data_t *folio_pgdat(const struct folio *folio)
1823 : {
1824 175318586 : return page_pgdat(&folio->page);
1825 : }
1826 :
1827 : #ifdef SECTION_IN_PAGE_FLAGS
1828 : static inline void set_page_section(struct page *page, unsigned long section)
1829 : {
1830 : page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
1831 : page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
1832 : }
1833 :
1834 : static inline unsigned long page_to_section(const struct page *page)
1835 : {
1836 : return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
1837 : }
1838 : #endif
1839 :
1840 : /**
1841 : * folio_pfn - Return the Page Frame Number of a folio.
1842 : * @folio: The folio.
1843 : *
1844 : * A folio may contain multiple pages. The pages have consecutive
1845 : * Page Frame Numbers.
1846 : *
1847 : * Return: The Page Frame Number of the first page in the folio.
1848 : */
1849 : static inline unsigned long folio_pfn(struct folio *folio)
1850 : {
1851 0 : return page_to_pfn(&folio->page);
1852 : }
1853 :
1854 : static inline struct folio *pfn_folio(unsigned long pfn)
1855 : {
1856 : return page_folio(pfn_to_page(pfn));
1857 : }
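
/*
 * Note (illustrative): for a head pfn the two helpers are inverses, i.e.
 * folio_pfn(pfn_folio(head_pfn)) == head_pfn; a tail pfn maps back to the
 * folio that contains it, so folio_pfn() then returns the head pfn instead.
 */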
1858 :
1859 : /**
1860 : * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
1861 : * @folio: The folio.
1862 : *
1863 : * This function checks if a folio has been pinned via a call to
1864 : * a function in the pin_user_pages() family.
1865 : *
1866 : * For small folios, the return value is partially fuzzy: false is not fuzzy,
1867 : * because it means "definitely not pinned for DMA", but true means "probably
1868 : * pinned for DMA, but possibly a false positive due to having at least
1869 : * GUP_PIN_COUNTING_BIAS worth of normal folio references".
1870 : *
1871 : * False positives are OK, because: a) it's unlikely for a folio to
1872 : * get that many refcounts, and b) all the callers of this routine are
1873 : * expected to be able to deal gracefully with a false positive.
1874 : *
1875 : * For large folios, the result will be exactly correct. That's because
1876 : * we have more tracking data available: the _pincount field is used
1877 : * instead of the GUP_PIN_COUNTING_BIAS scheme.
1878 : *
1879 : * For more information, please see Documentation/core-api/pin_user_pages.rst.
1880 : *
1881 : * Return: True, if it is likely that the page has been "dma-pinned".
1882 : * False, if the page is definitely not dma-pinned.
1883 : */
1884 : static inline bool folio_maybe_dma_pinned(struct folio *folio)
1885 : {
1886 0 : if (folio_test_large(folio))
1887 0 : return atomic_read(&folio->_pincount) > 0;
1888 :
1889 : /*
1890 : * folio_ref_count() is signed. If that refcount overflows, then
1891 : * folio_ref_count() returns a negative value, and callers will avoid
1892 : * further incrementing the refcount.
1893 : *
1894 : * Here, for that overflow case, use the sign bit to count a little
1895 : * bit higher via unsigned math, and thus still get an accurate result.
1896 : */
1897 0 : return ((unsigned int)folio_ref_count(folio)) >=
1898 : GUP_PIN_COUNTING_BIAS;
1899 : }
1900 :
1901 : static inline bool page_maybe_dma_pinned(struct page *page)
1902 : {
1903 : return folio_maybe_dma_pinned(page_folio(page));
1904 : }
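
/*
 * Illustrative usage sketch (hypothetical caller): paths that must be
 * conservative about DMA, such as migration or writeback, simply back off
 * on a positive answer and accept the occasional false positive:
 *
 *	static bool example_can_migrate(struct folio *folio)
 *	{
 *		if (folio_maybe_dma_pinned(folio))
 *			return false;	(a false positive only costs a skip)
 *		return true;
 *	}
 */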
1905 :
1906 : /*
1907 : * This should most likely only be called during fork() to see whether we
1908 : * should break the cow immediately for an anon page on the src mm.
1909 : *
1910 : * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
1911 : */
1912 : static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
1913 : struct page *page)
1914 : {
1915 : VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
1916 :
1917 : if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
1918 : return false;
1919 :
1920 : return page_maybe_dma_pinned(page);
1921 : }
1922 :
1923 : /**
1924 : * is_zero_page - Query if a page is a zero page
1925 : * @page: The page to query
1926 : *
1927 : * This returns true if @page is one of the permanent zero pages.
1928 : */
1929 : static inline bool is_zero_page(const struct page *page)
1930 : {
1931 : return is_zero_pfn(page_to_pfn(page));
1932 : }
1933 :
1934 : /**
1935 : * is_zero_folio - Query if a folio is a zero page
1936 : * @folio: The folio to query
1937 : *
1938 : * This returns true if @folio is one of the permanent zero pages.
1939 : */
1940 : static inline bool is_zero_folio(const struct folio *folio)
1941 : {
1942 : return is_zero_page(&folio->page);
1943 : }
1944 :
1945 : /* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning folios */
1946 : #ifdef CONFIG_MIGRATION
1947 : static inline bool folio_is_longterm_pinnable(struct folio *folio)
1948 : {
1949 : #ifdef CONFIG_CMA
1950 : int mt = folio_migratetype(folio);
1951 :
1952 : if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
1953 : return false;
1954 : #endif
1955 : /* The zero page can be "pinned" but gets special handling. */
1956 : if (is_zero_folio(folio))
1957 : return true;
1958 :
1959 : /* Coherent device memory must always allow eviction. */
1960 : if (folio_is_device_coherent(folio))
1961 : return false;
1962 :
1963 : /* Otherwise, non-movable zone folios can be pinned. */
1964 : return !folio_is_zone_movable(folio);
1965 :
1966 : }
1967 : #else
1968 : static inline bool folio_is_longterm_pinnable(struct folio *folio)
1969 : {
1970 : return true;
1971 : }
1972 : #endif
1973 :
1974 : static inline void set_page_zone(struct page *page, enum zone_type zone)
1975 : {
1976 : page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
1977 : page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
1978 : }
1979 :
1980 : static inline void set_page_node(struct page *page, unsigned long node)
1981 : {
1982 : page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
1983 : page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
1984 : }
1985 :
1986 : static inline void set_page_links(struct page *page, enum zone_type zone,
1987 : unsigned long node, unsigned long pfn)
1988 : {
1989 : set_page_zone(page, zone);
1990 : set_page_node(page, node);
1991 : #ifdef SECTION_IN_PAGE_FLAGS
1992 : set_page_section(page, pfn_to_section_nr(pfn));
1993 : #endif
1994 : }
1995 :
1996 : /**
1997 : * folio_nr_pages - The number of pages in the folio.
1998 : * @folio: The folio.
1999 : *
2000 : * Return: A positive power of two.
2001 : */
2002 : static inline long folio_nr_pages(struct folio *folio)
2003 : {
2004 3550868945 : if (!folio_test_large(folio))
2005 : return 1;
2006 : #ifdef CONFIG_64BIT
2007 102527676 : return folio->_folio_nr_pages;
2008 : #else
2009 : return 1L << folio->_folio_order;
2010 : #endif
2011 : }
2012 :
2013 : /*
2014 : * compound_nr() returns the number of pages in this potentially compound
2015 : * page. compound_nr() can be called on a tail page, and is defined to
2016 : * return 1 in that case.
2017 : */
2018 908085368 : static inline unsigned long compound_nr(struct page *page)
2019 : {
2020 908085368 : struct folio *folio = (struct folio *)page;
2021 :
2022 908085368 : if (!test_bit(PG_head, &folio->flags))
2023 : return 1;
2024 : #ifdef CONFIG_64BIT
2025 93726775 : return folio->_folio_nr_pages;
2026 : #else
2027 : return 1L << folio->_folio_order;
2028 : #endif
2029 : }
2030 :
2031 : /**
2032 : * thp_nr_pages - The number of regular pages in this huge page.
2033 : * @page: The head page of a huge page.
2034 : */
2035 : static inline int thp_nr_pages(struct page *page)
2036 : {
2037 0 : return folio_nr_pages((struct folio *)page);
2038 : }
2039 :
2040 : /**
2041 : * folio_next - Move to the next physical folio.
2042 : * @folio: The folio we're currently operating on.
2043 : *
2044 : * If you have physically contiguous memory which may span more than
2045 : * one folio (eg a &struct bio_vec), use this function to move from one
2046 : * folio to the next. Do not use it if the memory is only virtually
2047 : * contiguous as the folios are almost certainly not adjacent to each
2048 : * other. This is the folio equivalent to writing ``page++``.
2049 : *
2050 : * Context: We assume that the folios are refcounted and/or locked at a
2051 : * higher level and do not adjust the reference counts.
2052 : * Return: The next struct folio.
2053 : */
2054 : static inline struct folio *folio_next(struct folio *folio)
2055 : {
2056 112081126 : return (struct folio *)folio_page(folio, folio_nr_pages(folio));
2057 : }
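
/*
 * Illustrative sketch (hypothetical helper): walking a physically
 * contiguous range one folio at a time, assuming the range starts at the
 * first byte of @first:
 *
 *	static size_t example_span(struct folio *first, size_t bytes)
 *	{
 *		struct folio *folio = first;
 *		size_t done = 0;
 *
 *		while (done < bytes) {
 *			done += folio_size(folio);
 *			folio = folio_next(folio);
 *		}
 *		return done;
 *	}
 */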
2058 :
2059 : /**
2060 : * folio_shift - The size of the memory described by this folio.
2061 : * @folio: The folio.
2062 : *
2063 : * A folio represents a number of bytes which is a power-of-two in size.
2064 : * This function tells you which power-of-two the folio is. See also
2065 : * folio_size() and folio_order().
2066 : *
2067 : * Context: The caller should have a reference on the folio to prevent
2068 : * it from being split. It is not necessary for the folio to be locked.
2069 : * Return: The base-2 logarithm of the size of this folio.
2070 : */
2071 : static inline unsigned int folio_shift(struct folio *folio)
2072 : {
2073 258291489 : return PAGE_SHIFT + folio_order(folio);
2074 : }
2075 :
2076 : /**
2077 : * folio_size - The number of bytes in a folio.
2078 : * @folio: The folio.
2079 : *
2080 : * Context: The caller should have a reference on the folio to prevent
2081 : * it from being split. It is not necessary for the folio to be locked.
2082 : * Return: The number of bytes in this folio.
2083 : */
2084 : static inline size_t folio_size(struct folio *folio)
2085 : {
2086 5449680446 : return PAGE_SIZE << folio_order(folio);
2087 : }
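
/*
 * Worked example (illustrative): for an order-2 folio with 4KiB pages,
 * folio_order() == 2, folio_nr_pages() == 4, folio_shift() == PAGE_SHIFT + 2
 * == 14 and folio_size() == 16384 bytes, so offset_in_folio(folio, addr)
 * masks the address with 16383.
 */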
2088 :
2089 : /**
2090 : * folio_estimated_sharers - Estimate the number of sharers of a folio.
2091 : * @folio: The folio.
2092 : *
2093 : * folio_estimated_sharers() aims to serve as a function to efficiently
2094 : * estimate the number of processes sharing a folio. This is done by
2095 : * looking at the precise mapcount of the first subpage in the folio, and
2096 : * assuming the other subpages are the same. This may not be true for large
2097 : * folios. If you want exact mapcounts for exact calculations, look at
2098 : * page_mapcount() or folio_total_mapcount().
2099 : *
2100 : * Return: The estimated number of processes sharing a folio.
2101 : */
2102 : static inline int folio_estimated_sharers(struct folio *folio)
2103 : {
2104 : return page_mapcount(folio_page(folio, 0));
2105 : }
2106 :
2107 : #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
2108 : static inline int arch_make_page_accessible(struct page *page)
2109 : {
2110 : return 0;
2111 : }
2112 : #endif
2113 :
2114 : #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
2115 : static inline int arch_make_folio_accessible(struct folio *folio)
2116 : {
2117 57355412 : int ret;
2118 57355412 : long i, nr = folio_nr_pages(folio);
2119 :
2120 57355412 : for (i = 0; i < nr; i++) {
2121 : ret = arch_make_page_accessible(folio_page(folio, i));
2122 : if (ret)
2123 : break;
2124 : }
2125 :
2126 57355412 : return ret;
2127 : }
2128 : #endif
2129 :
2130 : /*
2131 : * Some inline functions in vmstat.h depend on page_zone()
2132 : */
2133 : #include <linux/vmstat.h>
2134 :
2135 : static __always_inline void *lowmem_page_address(const struct page *page)
2136 : {
2137 27215341375 : return page_to_virt(page);
2138 : }
2139 :
2140 : #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
2141 : #define HASHED_PAGE_VIRTUAL
2142 : #endif
2143 :
2144 : #if defined(WANT_PAGE_VIRTUAL)
2145 : static inline void *page_address(const struct page *page)
2146 : {
2147 : return page->virtual;
2148 : }
2149 : static inline void set_page_address(struct page *page, void *address)
2150 : {
2151 : page->virtual = address;
2152 : }
2153 : #define page_address_init() do { } while(0)
2154 : #endif
2155 :
2156 : #if defined(HASHED_PAGE_VIRTUAL)
2157 : void *page_address(const struct page *page);
2158 : void set_page_address(struct page *page, void *virtual);
2159 : void page_address_init(void);
2160 : #endif
2161 :
2162 : #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
2163 : #define page_address(page) lowmem_page_address(page)
2164 : #define set_page_address(page, address) do { } while(0)
2165 : #define page_address_init() do { } while(0)
2166 : #endif
2167 :
2168 : static inline void *folio_address(const struct folio *folio)
2169 : {
2170 349674 : return page_address(&folio->page);
2171 : }
2172 :
2173 : extern void *page_rmapping(struct page *page);
2174 : extern pgoff_t __page_file_index(struct page *page);
2175 :
2176 : /*
2177 : * Return the pagecache index of the passed page. Regular pagecache pages
2178 : * use ->index whereas swapcache pages use swp_offset(->private)
2179 : */
2180 0 : static inline pgoff_t page_index(struct page *page)
2181 : {
2182 0 : if (unlikely(PageSwapCache(page)))
2183 0 : return __page_file_index(page);
2184 0 : return page->index;
2185 : }
2186 :
2187 : /*
2188 : * Return true only if the page has been allocated with
2189 : * ALLOC_NO_WATERMARKS and the low watermark was not
2190 : * met implying that the system is under some pressure.
2191 : */
2192 : static inline bool page_is_pfmemalloc(const struct page *page)
2193 : {
2194 : /*
2195 : * lru.next has bit 1 set if the page is allocated from the
2196 : * pfmemalloc reserves. Callers may simply overwrite it if
2197 : * they do not need to preserve that information.
2198 : */
2199 : return (uintptr_t)page->lru.next & BIT(1);
2200 : }
2201 :
2202 : /*
2203 : * Return true only if the folio has been allocated with
2204 : * ALLOC_NO_WATERMARKS and the low watermark was not
2205 : * met implying that the system is under some pressure.
2206 : */
2207 : static inline bool folio_is_pfmemalloc(const struct folio *folio)
2208 : {
2209 : /*
2210 : * lru.next has bit 1 set if the folio is allocated from the
2211 : * pfmemalloc reserves. Callers may simply overwrite it if
2212 : * they do not need to preserve that information.
2213 : */
2214 : return (uintptr_t)folio->lru.next & BIT(1);
2215 : }
2216 :
2217 : /*
2218 : * Only to be called by the page allocator on a freshly allocated
2219 : * page.
2220 : */
2221 : static inline void set_page_pfmemalloc(struct page *page)
2222 : {
2223 : page->lru.next = (void *)BIT(1);
2224 : }
2225 :
2226 : static inline void clear_page_pfmemalloc(struct page *page)
2227 : {
2228 : page->lru.next = NULL;
2229 : }
2230 :
2231 : /*
2232 : * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
2233 : */
2234 : extern void pagefault_out_of_memory(void);
2235 :
2236 : #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
2237 : #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
2238 : #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
2239 :
2240 : /*
2241 : * Flags passed to show_mem() and show_free_areas() to suppress output in
2242 : * various contexts.
2243 : */
2244 : #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
2245 :
2246 : extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
2247 : static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
2248 : {
2249 : __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
2250 : }
2251 :
2252 : /*
2253 : * Parameter block passed down to zap_pte_range in exceptional cases.
2254 : */
2255 : struct zap_details {
2256 : struct folio *single_folio; /* Locked folio to be unmapped */
2257 : bool even_cows; /* Zap COWed private pages too? */
2258 : zap_flags_t zap_flags; /* Extra flags for zapping */
2259 : };
2260 :
2261 : /*
2262 : * Whether to drop the pte markers, for example, the uffd-wp information for
2263 : * file-backed memory. This should only be specified when we will completely
2264 : * drop the page in the mm, either by truncation or unmapping of the vma. By
2265 : * default, the flag is not set.
2266 : */
2267 : #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
2268 : /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
2269 : #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
2270 :
2271 : #ifdef CONFIG_SCHED_MM_CID
2272 : void sched_mm_cid_before_execve(struct task_struct *t);
2273 : void sched_mm_cid_after_execve(struct task_struct *t);
2274 : void sched_mm_cid_fork(struct task_struct *t);
2275 : void sched_mm_cid_exit_signals(struct task_struct *t);
2276 : static inline int task_mm_cid(struct task_struct *t)
2277 : {
2278 : return t->mm_cid;
2279 : }
2280 : #else
2281 : static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
2282 : static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
2283 : static inline void sched_mm_cid_fork(struct task_struct *t) { }
2284 : static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
2285 : static inline int task_mm_cid(struct task_struct *t)
2286 : {
2287 : /*
2288 : * Use the processor id as a fall-back when the mm cid feature is
2289 : * disabled. This provides functional per-cpu data structure accesses
2290 : * in user-space, although it won't provide the memory usage benefits.
2291 : */
2292 : return raw_smp_processor_id();
2293 : }
2294 : #endif
2295 :
2296 : #ifdef CONFIG_MMU
2297 : extern bool can_do_mlock(void);
2298 : #else
2299 : static inline bool can_do_mlock(void) { return false; }
2300 : #endif
2301 : extern int user_shm_lock(size_t, struct ucounts *);
2302 : extern void user_shm_unlock(size_t, struct ucounts *);
2303 :
2304 : struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
2305 : pte_t pte);
2306 : struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
2307 : pte_t pte);
2308 : struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
2309 : pmd_t pmd);
2310 :
2311 : void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
2312 : unsigned long size);
2313 : void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
2314 : unsigned long size, struct zap_details *details);
2315 : static inline void zap_vma_pages(struct vm_area_struct *vma)
2316 : {
2317 : zap_page_range_single(vma, vma->vm_start,
2318 : vma->vm_end - vma->vm_start, NULL);
2319 : }
2320 : void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
2321 : struct vm_area_struct *start_vma, unsigned long start,
2322 : unsigned long end, bool mm_wr_locked);
2323 :
2324 : struct mmu_notifier_range;
2325 :
2326 : void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
2327 : unsigned long end, unsigned long floor, unsigned long ceiling);
2328 : int
2329 : copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
2330 : int follow_pte(struct mm_struct *mm, unsigned long address,
2331 : pte_t **ptepp, spinlock_t **ptlp);
2332 : int follow_pfn(struct vm_area_struct *vma, unsigned long address,
2333 : unsigned long *pfn);
2334 : int follow_phys(struct vm_area_struct *vma, unsigned long address,
2335 : unsigned int flags, unsigned long *prot, resource_size_t *phys);
2336 : int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2337 : void *buf, int len, int write);
2338 :
2339 : extern void truncate_pagecache(struct inode *inode, loff_t new);
2340 : extern void truncate_setsize(struct inode *inode, loff_t newsize);
2341 : void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
2342 : void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
2343 : int generic_error_remove_page(struct address_space *mapping, struct page *page);
2344 :
2345 : struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
2346 : unsigned long address, struct pt_regs *regs);
2347 :
2348 : #ifdef CONFIG_MMU
2349 : extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
2350 : unsigned long address, unsigned int flags,
2351 : struct pt_regs *regs);
2352 : extern int fixup_user_fault(struct mm_struct *mm,
2353 : unsigned long address, unsigned int fault_flags,
2354 : bool *unlocked);
2355 : void unmap_mapping_pages(struct address_space *mapping,
2356 : pgoff_t start, pgoff_t nr, bool even_cows);
2357 : void unmap_mapping_range(struct address_space *mapping,
2358 : loff_t const holebegin, loff_t const holelen, int even_cows);
2359 : #else
2360 : static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
2361 : unsigned long address, unsigned int flags,
2362 : struct pt_regs *regs)
2363 : {
2364 : /* should never happen if there's no MMU */
2365 : BUG();
2366 : return VM_FAULT_SIGBUS;
2367 : }
2368 : static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
2369 : unsigned int fault_flags, bool *unlocked)
2370 : {
2371 : /* should never happen if there's no MMU */
2372 : BUG();
2373 : return -EFAULT;
2374 : }
2375 : static inline void unmap_mapping_pages(struct address_space *mapping,
2376 : pgoff_t start, pgoff_t nr, bool even_cows) { }
2377 : static inline void unmap_mapping_range(struct address_space *mapping,
2378 : loff_t const holebegin, loff_t const holelen, int even_cows) { }
2379 : #endif
2380 :
2381 : static inline void unmap_shared_mapping_range(struct address_space *mapping,
2382 : loff_t const holebegin, loff_t const holelen)
2383 : {
2384 : unmap_mapping_range(mapping, holebegin, holelen, 0);
2385 : }
2386 :
2387 : static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
2388 : unsigned long addr);
2389 :
2390 : extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
2391 : void *buf, int len, unsigned int gup_flags);
2392 : extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2393 : void *buf, int len, unsigned int gup_flags);
2394 : extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
2395 : void *buf, int len, unsigned int gup_flags);
2396 :
2397 : long get_user_pages_remote(struct mm_struct *mm,
2398 : unsigned long start, unsigned long nr_pages,
2399 : unsigned int gup_flags, struct page **pages,
2400 : int *locked);
2401 : long pin_user_pages_remote(struct mm_struct *mm,
2402 : unsigned long start, unsigned long nr_pages,
2403 : unsigned int gup_flags, struct page **pages,
2404 : int *locked);
2405 :
2406 : static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
2407 : unsigned long addr,
2408 : int gup_flags,
2409 : struct vm_area_struct **vmap)
2410 : {
2411 : struct page *page;
2412 : struct vm_area_struct *vma;
2413 : int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
2414 :
2415 : if (got < 0)
2416 : return ERR_PTR(got);
2417 : if (got == 0)
2418 : return NULL;
2419 :
2420 : vma = vma_lookup(mm, addr);
2421 : if (WARN_ON_ONCE(!vma)) {
2422 : put_page(page);
2423 : return ERR_PTR(-EINVAL);
2424 : }
2425 :
2426 : *vmap = vma;
2427 : return page;
2428 : }
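
/*
 * Illustrative usage sketch (hypothetical caller, mmap lock held): look up
 * one remote page together with its VMA and drop the reference when done:
 *
 *	struct vm_area_struct *vma;
 *	struct page *page = get_user_page_vma_remote(mm, addr,
 *						      FOLL_FORCE, &vma);
 *
 *	if (IS_ERR_OR_NULL(page))
 *		return page ? PTR_ERR(page) : -EFAULT;
 *	(use page and vma, then)
 *	put_page(page);
 */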
2429 :
2430 : long get_user_pages(unsigned long start, unsigned long nr_pages,
2431 : unsigned int gup_flags, struct page **pages);
2432 : long pin_user_pages(unsigned long start, unsigned long nr_pages,
2433 : unsigned int gup_flags, struct page **pages);
2434 : long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2435 : struct page **pages, unsigned int gup_flags);
2436 : long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2437 : struct page **pages, unsigned int gup_flags);
2438 :
2439 : int get_user_pages_fast(unsigned long start, int nr_pages,
2440 : unsigned int gup_flags, struct page **pages);
2441 : int pin_user_pages_fast(unsigned long start, int nr_pages,
2442 : unsigned int gup_flags, struct page **pages);
2443 : void folio_add_pin(struct folio *folio);
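
/*
 * Illustrative sketch (hypothetical helper): the usual pin/use/unpin
 * pattern for user pages that will be the target of DMA or long-lived I/O.
 * Pages pinned with the pin_user_pages*() family must be released with
 * unpin_user_page*(), never with put_page():
 *
 *	static int example_pin_buffer(unsigned long uaddr, int nr,
 *				      struct page **pages)
 *	{
 *		int pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
 *
 *		if (pinned <= 0)
 *			return pinned ? pinned : -EFAULT;
 *		(do the I/O, then)
 *		unpin_user_pages(pages, pinned);
 *		return 0;
 *	}
 */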
2444 :
2445 : int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
2446 : int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
2447 : struct task_struct *task, bool bypass_rlim);
2448 :
2449 : struct kvec;
2450 : struct page *get_dump_page(unsigned long addr);
2451 :
2452 : bool folio_mark_dirty(struct folio *folio);
2453 : bool set_page_dirty(struct page *page);
2454 : int set_page_dirty_lock(struct page *page);
2455 :
2456 : int get_cmdline(struct task_struct *task, char *buffer, int buflen);
2457 :
2458 : extern unsigned long move_page_tables(struct vm_area_struct *vma,
2459 : unsigned long old_addr, struct vm_area_struct *new_vma,
2460 : unsigned long new_addr, unsigned long len,
2461 : bool need_rmap_locks);
2462 :
2463 : /*
2464 : * Flags used by change_protection(). For now we make it a bitmap so
2465 : * that we can pass in multiple flags just like parameters. However,
2466 : * for now all the callers only use one of the flags at a
2467 : * time.
2468 : */
2469 : /*
2470 : * Whether we should manually check if we can map individual PTEs writable,
2471 : * because something (e.g., COW, uffd-wp) blocks that from happening for all
2472 : * PTEs automatically in a writable mapping.
2473 : */
2474 : #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0)
2475 : /* Whether this protection change is for NUMA hints */
2476 : #define MM_CP_PROT_NUMA (1UL << 1)
2477 : /* Whether this change is for write protecting */
2478 : #define MM_CP_UFFD_WP (1UL << 2) /* do wp */
2479 : #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
2480 : #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
2481 : MM_CP_UFFD_WP_RESOLVE)
2482 :
2483 : bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
2484 : int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
2485 : static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
2486 : {
2487 : /*
2488 : * We want to check manually if we can change individual PTEs writable
2489 : * if we can't do that automatically for all PTEs in a mapping. For
2490 : * private mappings, that's always the case when we have write
2491 : * permissions as we properly have to handle COW.
2492 : */
2493 : if (vma->vm_flags & VM_SHARED)
2494 : return vma_wants_writenotify(vma, vma->vm_page_prot);
2495 : return !!(vma->vm_flags & VM_WRITE);
2496 :
2497 : }
2498 : bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
2499 : pte_t pte);
2500 : extern long change_protection(struct mmu_gather *tlb,
2501 : struct vm_area_struct *vma, unsigned long start,
2502 : unsigned long end, unsigned long cp_flags);
2503 : extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
2504 : struct vm_area_struct *vma, struct vm_area_struct **pprev,
2505 : unsigned long start, unsigned long end, unsigned long newflags);
2506 :
2507 : /*
2508 : * Like get_user_pages_fast(), but doesn't attempt to fault in pages and may return fewer pages than requested (i.e. it returns short).
2509 : */
2510 : int get_user_pages_fast_only(unsigned long start, int nr_pages,
2511 : unsigned int gup_flags, struct page **pages);
2512 :
2513 : static inline bool get_user_page_fast_only(unsigned long addr,
2514 : unsigned int gup_flags, struct page **pagep)
2515 : {
2516 : return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
2517 : }
2518 : /*
2519 : * per-process(per-mm_struct) statistics.
2520 : */
2521 : static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
2522 : {
2523 15430776 : return percpu_counter_read_positive(&mm->rss_stat[member]);
2524 : }
2525 :
2526 : void mm_trace_rss_stat(struct mm_struct *mm, int member);
2527 :
2528 31751561 : static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
2529 : {
2530 31751561 : percpu_counter_add(&mm->rss_stat[member], value);
2531 :
2532 31749974 : mm_trace_rss_stat(mm, member);
2533 31753221 : }
2534 :
2535 : static inline void inc_mm_counter(struct mm_struct *mm, int member)
2536 : {
2537 : percpu_counter_inc(&mm->rss_stat[member]);
2538 :
2539 : mm_trace_rss_stat(mm, member);
2540 : }
2541 :
2542 : static inline void dec_mm_counter(struct mm_struct *mm, int member)
2543 : {
2544 : percpu_counter_dec(&mm->rss_stat[member]);
2545 :
2546 : mm_trace_rss_stat(mm, member);
2547 : }
2548 :
2549 : /* Optimized variant when page is already known not to be PageAnon */
2550 : static inline int mm_counter_file(struct page *page)
2551 : {
2552 : if (PageSwapBacked(page))
2553 : return MM_SHMEMPAGES;
2554 : return MM_FILEPAGES;
2555 : }
2556 :
2557 : static inline int mm_counter(struct page *page)
2558 : {
2559 : if (PageAnon(page))
2560 : return MM_ANONPAGES;
2561 : return mm_counter_file(page);
2562 : }
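
/*
 * Illustrative usage (hypothetical call sites): when a page is mapped into
 * or unmapped from an mm, the counter matching its type is adjusted:
 *
 *	add_mm_counter(mm, mm_counter(page), 1);	on map
 *	add_mm_counter(mm, mm_counter(page), -1);	on unmap
 *
 * mm_counter_file() can be used instead when the page is already known not
 * to be anonymous.
 */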
2563 :
2564 : static inline unsigned long get_mm_rss(struct mm_struct *mm)
2565 : {
2566 15430776 : return get_mm_counter(mm, MM_FILEPAGES) +
2567 15430776 : get_mm_counter(mm, MM_ANONPAGES) +
2568 : get_mm_counter(mm, MM_SHMEMPAGES);
2569 : }
2570 :
2571 : static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
2572 : {
2573 15430776 : return max(mm->hiwater_rss, get_mm_rss(mm));
2574 : }
2575 :
2576 : static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
2577 : {
2578 : return max(mm->hiwater_vm, mm->total_vm);
2579 : }
2580 :
2581 : static inline void update_hiwater_rss(struct mm_struct *mm)
2582 : {
2583 : unsigned long _rss = get_mm_rss(mm);
2584 :
2585 : if ((mm)->hiwater_rss < _rss)
2586 : (mm)->hiwater_rss = _rss;
2587 : }
2588 :
2589 : static inline void update_hiwater_vm(struct mm_struct *mm)
2590 : {
2591 : if (mm->hiwater_vm < mm->total_vm)
2592 : mm->hiwater_vm = mm->total_vm;
2593 : }
2594 :
2595 : static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
2596 : {
2597 : mm->hiwater_rss = get_mm_rss(mm);
2598 : }
2599 :
2600 15430776 : static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
2601 : struct mm_struct *mm)
2602 : {
2603 15430776 : unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
2604 :
2605 15430776 : if (*maxrss < hiwater_rss)
2606 15248923 : *maxrss = hiwater_rss;
2607 15430776 : }
2608 :
2609 : #if defined(SPLIT_RSS_COUNTING)
2610 : void sync_mm_rss(struct mm_struct *mm);
2611 : #else
2612 : static inline void sync_mm_rss(struct mm_struct *mm)
2613 : {
2614 : }
2615 : #endif
2616 :
2617 : #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
2618 : static inline int pte_special(pte_t pte)
2619 : {
2620 : return 0;
2621 : }
2622 :
2623 : static inline pte_t pte_mkspecial(pte_t pte)
2624 : {
2625 : return pte;
2626 : }
2627 : #endif
2628 :
2629 : #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
2630 : static inline int pte_devmap(pte_t pte)
2631 : {
2632 : return 0;
2633 : }
2634 : #endif
2635 :
2636 : extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2637 : spinlock_t **ptl);
2638 : static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
2639 : spinlock_t **ptl)
2640 : {
2641 : pte_t *ptep;
2642 : __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
2643 : return ptep;
2644 : }
2645 :
2646 : #ifdef __PAGETABLE_P4D_FOLDED
2647 : static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
2648 : unsigned long address)
2649 : {
2650 : return 0;
2651 : }
2652 : #else
2653 : int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
2654 : #endif
2655 :
2656 : #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
2657 : static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
2658 : unsigned long address)
2659 : {
2660 : return 0;
2661 : }
2662 : static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
2663 : static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
2664 :
2665 : #else
2666 : int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
2667 :
2668 : static inline void mm_inc_nr_puds(struct mm_struct *mm)
2669 : {
2670 : if (mm_pud_folded(mm))
2671 : return;
2672 : atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
2673 : }
2674 :
2675 : static inline void mm_dec_nr_puds(struct mm_struct *mm)
2676 : {
2677 : if (mm_pud_folded(mm))
2678 : return;
2679 : atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
2680 : }
2681 : #endif
2682 :
2683 : #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
2684 : static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
2685 : unsigned long address)
2686 : {
2687 : return 0;
2688 : }
2689 :
2690 : static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
2691 : static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
2692 :
2693 : #else
2694 : int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
2695 :
2696 : static inline void mm_inc_nr_pmds(struct mm_struct *mm)
2697 : {
2698 : if (mm_pmd_folded(mm))
2699 : return;
2700 : atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
2701 : }
2702 :
2703 : static inline void mm_dec_nr_pmds(struct mm_struct *mm)
2704 : {
2705 : if (mm_pmd_folded(mm))
2706 : return;
2707 : atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
2708 : }
2709 : #endif
2710 :
2711 : #ifdef CONFIG_MMU
2712 : static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
2713 : {
2714 : atomic_long_set(&mm->pgtables_bytes, 0);
2715 : }
2716 :
2717 : static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
2718 : {
2719 : return atomic_long_read(&mm->pgtables_bytes);
2720 : }
2721 :
2722 : static inline void mm_inc_nr_ptes(struct mm_struct *mm)
2723 : {
2724 : atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
2725 : }
2726 :
2727 : static inline void mm_dec_nr_ptes(struct mm_struct *mm)
2728 : {
2729 : atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
2730 : }
2731 : #else
2732 :
2733 : static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
2734 : static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
2735 : {
2736 : return 0;
2737 : }
2738 :
2739 : static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
2740 : static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
2741 : #endif
2742 :
2743 : int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
2744 : int __pte_alloc_kernel(pmd_t *pmd);
2745 :
2746 : #if defined(CONFIG_MMU)
2747 :
2748 : static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
2749 : unsigned long address)
2750 : {
2751 : return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
2752 : NULL : p4d_offset(pgd, address);
2753 : }
2754 :
2755 : static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
2756 : unsigned long address)
2757 : {
2758 : return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
2759 : NULL : pud_offset(p4d, address);
2760 : }
2761 :
2762 : static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2763 : {
2764 : return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
2765 : NULL: pmd_offset(pud, address);
2766 : }
2767 : #endif /* CONFIG_MMU */
2768 :
2769 : #if USE_SPLIT_PTE_PTLOCKS
2770 : #if ALLOC_SPLIT_PTLOCKS
2771 : void __init ptlock_cache_init(void);
2772 : extern bool ptlock_alloc(struct page *page);
2773 : extern void ptlock_free(struct page *page);
2774 :
2775 : static inline spinlock_t *ptlock_ptr(struct page *page)
2776 : {
2777 : return page->ptl;
2778 : }
2779 : #else /* ALLOC_SPLIT_PTLOCKS */
2780 : static inline void ptlock_cache_init(void)
2781 : {
2782 : }
2783 :
2784 : static inline bool ptlock_alloc(struct page *page)
2785 : {
2786 : return true;
2787 : }
2788 :
2789 : static inline void ptlock_free(struct page *page)
2790 : {
2791 : }
2792 :
2793 : static inline spinlock_t *ptlock_ptr(struct page *page)
2794 : {
2795 : return &page->ptl;
2796 : }
2797 : #endif /* ALLOC_SPLIT_PTLOCKS */
2798 :
2799 : static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
2800 : {
2801 : return ptlock_ptr(pmd_page(*pmd));
2802 : }
2803 :
2804 : static inline bool ptlock_init(struct page *page)
2805 : {
2806 : /*
2807 : * prep_new_page() initializes page->private (and therefore page->ptl)
2808 : * with 0. Make sure nobody took it into use in between.
2809 : *
2810 : * That can happen if an arch tries to use slab for page table allocation:
2811 : * slab code uses page->slab_cache, which shares storage with page->ptl.
2812 : */
2813 : VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
2814 : if (!ptlock_alloc(page))
2815 : return false;
2816 : spin_lock_init(ptlock_ptr(page));
2817 : return true;
2818 : }
2819 :
2820 : #else /* !USE_SPLIT_PTE_PTLOCKS */
2821 : /*
2822 : * We use mm->page_table_lock to guard all pagetable pages of the mm.
2823 : */
2824 : static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
2825 : {
2826 : return &mm->page_table_lock;
2827 : }
2828 : static inline void ptlock_cache_init(void) {}
2829 : static inline bool ptlock_init(struct page *page) { return true; }
2830 : static inline void ptlock_free(struct page *page) {}
2831 : #endif /* USE_SPLIT_PTE_PTLOCKS */
2832 :
2833 : static inline bool pgtable_pte_page_ctor(struct page *page)
2834 : {
2835 : if (!ptlock_init(page))
2836 : return false;
2837 : __SetPageTable(page);
2838 : inc_lruvec_page_state(page, NR_PAGETABLE);
2839 : return true;
2840 : }
2841 :
2842 : static inline void pgtable_pte_page_dtor(struct page *page)
2843 : {
2844 : ptlock_free(page);
2845 : __ClearPageTable(page);
2846 : dec_lruvec_page_state(page, NR_PAGETABLE);
2847 : }
2848 :
2849 : pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
2850 : static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
2851 : {
2852 : return __pte_offset_map(pmd, addr, NULL);
2853 : }
2854 :
2855 : pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
2856 : unsigned long addr, spinlock_t **ptlp);
2857 : static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
2858 : unsigned long addr, spinlock_t **ptlp)
2859 : {
2860 0 : pte_t *pte;
2861 :
2862 0 : __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp));
2863 0 : return pte;
2864 : }
2865 :
2866 : pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
2867 : unsigned long addr, spinlock_t **ptlp);
2868 :
2869 : #define pte_unmap_unlock(pte, ptl) do { \
2870 : spin_unlock(ptl); \
2871 : pte_unmap(pte); \
2872 : } while (0)
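
/*
 * Illustrative sketch (hypothetical helper): the usual pattern for looking
 * at a single PTE under its (possibly split) page table lock; the map and
 * the unlock/unmap must always be paired:
 *
 *	static bool example_pte_present(struct mm_struct *mm, pmd_t *pmd,
 *					unsigned long addr)
 *	{
 *		spinlock_t *ptl;
 *		pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 *		bool ret;
 *
 *		if (!pte)
 *			return false;	(the page table vanished under us)
 *		ret = pte_present(ptep_get(pte));
 *		pte_unmap_unlock(pte, ptl);
 *		return ret;
 *	}
 */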
2873 :
2874 : #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
2875 :
2876 : #define pte_alloc_map(mm, pmd, address) \
2877 : (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
2878 :
2879 : #define pte_alloc_map_lock(mm, pmd, address, ptlp) \
2880 : (pte_alloc(mm, pmd) ? \
2881 : NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
2882 :
2883 : #define pte_alloc_kernel(pmd, address) \
2884 : ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
2885 : NULL: pte_offset_kernel(pmd, address))
2886 :
2887 : #if USE_SPLIT_PMD_PTLOCKS
2888 :
2889 : static inline struct page *pmd_pgtable_page(pmd_t *pmd)
2890 : {
2891 : unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
2892 : return virt_to_page((void *)((unsigned long) pmd & mask));
2893 : }
2894 :
2895 : static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
2896 : {
2897 : return ptlock_ptr(pmd_pgtable_page(pmd));
2898 : }
2899 :
2900 : static inline bool pmd_ptlock_init(struct page *page)
2901 : {
2902 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2903 : page->pmd_huge_pte = NULL;
2904 : #endif
2905 : return ptlock_init(page);
2906 : }
2907 :
2908 : static inline void pmd_ptlock_free(struct page *page)
2909 : {
2910 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2911 : VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
2912 : #endif
2913 : ptlock_free(page);
2914 : }
2915 :
2916 : #define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
2917 :
2918 : #else
2919 :
2920 : static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
2921 : {
2922 : return &mm->page_table_lock;
2923 : }
2924 :
2925 : static inline bool pmd_ptlock_init(struct page *page) { return true; }
2926 : static inline void pmd_ptlock_free(struct page *page) {}
2927 :
2928 : #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
2929 :
2930 : #endif
2931 :
2932 : static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
2933 : {
2934 : spinlock_t *ptl = pmd_lockptr(mm, pmd);
2935 : spin_lock(ptl);
2936 : return ptl;
2937 : }
2938 :
2939 : static inline bool pgtable_pmd_page_ctor(struct page *page)
2940 : {
2941 : if (!pmd_ptlock_init(page))
2942 : return false;
2943 : __SetPageTable(page);
2944 : inc_lruvec_page_state(page, NR_PAGETABLE);
2945 : return true;
2946 : }
2947 :
2948 : static inline void pgtable_pmd_page_dtor(struct page *page)
2949 : {
2950 : pmd_ptlock_free(page);
2951 : __ClearPageTable(page);
2952 : dec_lruvec_page_state(page, NR_PAGETABLE);
2953 : }
2954 :
2955 : /*
2956 : * No scalability reason to split PUD locks yet, but follow the same pattern
2957 : * as the PMD locks to make it easier if we decide to. The VM should not be
2958 : * considered ready to switch to split PUD locks yet; there may be places
2959 : * which need to be converted from page_table_lock.
2960 : */
2961 : static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
2962 : {
2963 : return &mm->page_table_lock;
2964 : }
2965 :
2966 : static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
2967 : {
2968 : spinlock_t *ptl = pud_lockptr(mm, pud);
2969 :
2970 : spin_lock(ptl);
2971 : return ptl;
2972 : }
2973 :
2974 : extern void __init pagecache_init(void);
2975 : extern void free_initmem(void);
2976 :
2977 : /*
2978 : * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
2979 : * into the buddy system. The freed pages will be poisoned with pattern
2980 : * "poison" if it's within range [0, UCHAR_MAX].
2981 : * Return pages freed into the buddy system.
2982 : */
2983 : extern unsigned long free_reserved_area(void *start, void *end,
2984 : int poison, const char *s);
2985 :
2986 : extern void adjust_managed_page_count(struct page *page, long count);
2987 :
2988 : extern void reserve_bootmem_region(phys_addr_t start,
2989 : phys_addr_t end, int nid);
2990 :
2991 : /* Free the reserved page into the buddy system, so it gets managed. */
2992 : static inline void free_reserved_page(struct page *page)
2993 : {
2994 : ClearPageReserved(page);
2995 : init_page_count(page);
2996 : __free_page(page);
2997 : adjust_managed_page_count(page, 1);
2998 : }
2999 : #define free_highmem_page(page) free_reserved_page(page)
3000 :
3001 : static inline void mark_page_reserved(struct page *page)
3002 : {
3003 : SetPageReserved(page);
3004 : adjust_managed_page_count(page, -1);
3005 : }
3006 :
3007 : /*
3008 : * Default method to free all the __init memory into the buddy system.
3009 : * The freed pages will be poisoned with pattern "poison" if it's within
3010 : * range [0, UCHAR_MAX].
3011 : * Returns the number of pages freed into the buddy system.
3012 : */
3013 : static inline unsigned long free_initmem_default(int poison)
3014 : {
3015 : extern char __init_begin[], __init_end[];
3016 :
3017 : return free_reserved_area(&__init_begin, &__init_end,
3018 : poison, "unused kernel image (initmem)");
3019 : }
3020 :
3021 : static inline unsigned long get_num_physpages(void)
3022 : {
3023 : int nid;
3024 : unsigned long phys_pages = 0;
3025 :
3026 : for_each_online_node(nid)
3027 : phys_pages += node_present_pages(nid);
3028 :
3029 : return phys_pages;
3030 : }
3031 :
3032 : /*
3033 : * Using memblock node mappings, an architecture may initialise its
3034 : * zones, allocate the backing mem_map and account for memory holes in an
3035 : * architecture independent manner.
3036 : *
3037 : * An architecture is expected to register the ranges of page frames backed by
3038 : * physical memory with memblock_add[_node]() before calling
3039 : * free_area_init(), passing in the PFN each zone ends at. For basic
3040 : * usage, an architecture is expected to do something like
3041 : *
3042 : * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
3043 : * max_highmem_pfn};
3044 : * for_each_valid_physical_page_range()
3045 : * memblock_add_node(base, size, nid, MEMBLOCK_NONE)
3046 : * free_area_init(max_zone_pfns);
3047 : */
3048 : void free_area_init(unsigned long *max_zone_pfn);
3049 : unsigned long node_map_pfn_alignment(void);
3050 : unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
3051 : unsigned long end_pfn);
3052 : extern unsigned long absent_pages_in_range(unsigned long start_pfn,
3053 : unsigned long end_pfn);
3054 : extern void get_pfn_range_for_nid(unsigned int nid,
3055 : unsigned long *start_pfn, unsigned long *end_pfn);
3056 :
3057 : #ifndef CONFIG_NUMA
3058 : static inline int early_pfn_to_nid(unsigned long pfn)
3059 : {
3060 : return 0;
3061 : }
3062 : #else
3063 : /* please see mm/page_alloc.c */
3064 : extern int __meminit early_pfn_to_nid(unsigned long pfn);
3065 : #endif
3066 :
3067 : extern void set_dma_reserve(unsigned long new_dma_reserve);
3068 : extern void mem_init(void);
3069 : extern void __init mmap_init(void);
3070 :
3071 : extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
3072 : static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
3073 : {
3074 : __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
3075 : }
3076 : extern long si_mem_available(void);
3077 : extern void si_meminfo(struct sysinfo * val);
3078 : extern void si_meminfo_node(struct sysinfo *val, int nid);
3079 : #ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
3080 : extern unsigned long arch_reserved_kernel_pages(void);
3081 : #endif
3082 :
3083 : extern __printf(3, 4)
3084 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
3085 :
3086 : extern void setup_per_cpu_pageset(void);
3087 :
3088 : /* nommu.c */
3089 : extern atomic_long_t mmap_pages_allocated;
3090 : extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
3091 :
3092 : /* interval_tree.c */
3093 : void vma_interval_tree_insert(struct vm_area_struct *node,
3094 : struct rb_root_cached *root);
3095 : void vma_interval_tree_insert_after(struct vm_area_struct *node,
3096 : struct vm_area_struct *prev,
3097 : struct rb_root_cached *root);
3098 : void vma_interval_tree_remove(struct vm_area_struct *node,
3099 : struct rb_root_cached *root);
3100 : struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
3101 : unsigned long start, unsigned long last);
3102 : struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
3103 : unsigned long start, unsigned long last);
3104 :
3105 : #define vma_interval_tree_foreach(vma, root, start, last) \
3106 : for (vma = vma_interval_tree_iter_first(root, start, last); \
3107 : vma; vma = vma_interval_tree_iter_next(vma, start, last))
3108 :
3109 : void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
3110 : struct rb_root_cached *root);
3111 : void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
3112 : struct rb_root_cached *root);
3113 : struct anon_vma_chain *
3114 : anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
3115 : unsigned long start, unsigned long last);
3116 : struct anon_vma_chain *anon_vma_interval_tree_iter_next(
3117 : struct anon_vma_chain *node, unsigned long start, unsigned long last);
3118 : #ifdef CONFIG_DEBUG_VM_RB
3119 : void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
3120 : #endif
3121 :
3122 : #define anon_vma_interval_tree_foreach(avc, root, start, last) \
3123 : for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
3124 : avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
3125 :
3126 : /* mmap.c */
3127 : extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
3128 : extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
3129 : unsigned long start, unsigned long end, pgoff_t pgoff,
3130 : struct vm_area_struct *next);
3131 : extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
3132 : unsigned long start, unsigned long end, pgoff_t pgoff);
3133 : extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
3134 : struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
3135 : unsigned long end, unsigned long vm_flags, struct anon_vma *,
3136 : struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
3137 : struct anon_vma_name *);
3138 : extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
3139 : extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
3140 : unsigned long addr, int new_below);
3141 : extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
3142 : unsigned long addr, int new_below);
3143 : extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
3144 : extern void unlink_file_vma(struct vm_area_struct *);
3145 : extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
3146 : unsigned long addr, unsigned long len, pgoff_t pgoff,
3147 : bool *need_rmap_locks);
3148 : extern void exit_mmap(struct mm_struct *);
3149 :
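     : /*
     :  * Check a proposed data segment size against RLIMIT_DATA: returns 0 when
     :  * the new [start, new) span plus the existing [start_data, end_data) span
     :  * still fits under @rlim, and -ENOSPC otherwise.  RLIM_INFINITY disables
     :  * the check.
     :  */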
3150 : static inline int check_data_rlimit(unsigned long rlim,
3151 : unsigned long new,
3152 : unsigned long start,
3153 : unsigned long end_data,
3154 : unsigned long start_data)
3155 : {
3156 : if (rlim < RLIM_INFINITY) {
3157 : if (((new - start) + (end_data - start_data)) > rlim)
3158 : return -ENOSPC;
3159 : }
3160 :
3161 : return 0;
3162 : }
3163 :
3164 : extern int mm_take_all_locks(struct mm_struct *mm);
3165 : extern void mm_drop_all_locks(struct mm_struct *mm);
3166 :
3167 : extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
3168 : extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
3169 : extern struct file *get_mm_exe_file(struct mm_struct *mm);
3170 : extern struct file *get_task_exe_file(struct task_struct *task);
3171 :
3172 : extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
3173 : extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
3174 :
3175 : extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
3176 : const struct vm_special_mapping *sm);
3177 : extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
3178 : unsigned long addr, unsigned long len,
3179 : unsigned long flags,
3180 : const struct vm_special_mapping *spec);
3181 : /* This is an obsolete alternative to _install_special_mapping. */
3182 : extern int install_special_mapping(struct mm_struct *mm,
3183 : unsigned long addr, unsigned long len,
3184 : unsigned long flags, struct page **pages);
3185 :
3186 : unsigned long randomize_stack_top(unsigned long stack_top);
3187 : unsigned long randomize_page(unsigned long start, unsigned long range);
3188 :
3189 : extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
3190 :
3191 : extern unsigned long mmap_region(struct file *file, unsigned long addr,
3192 : unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
3193 : struct list_head *uf);
3194 : extern unsigned long do_mmap(struct file *file, unsigned long addr,
3195 : unsigned long len, unsigned long prot, unsigned long flags,
3196 : unsigned long pgoff, unsigned long *populate, struct list_head *uf);
3197 : extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
3198 : unsigned long start, size_t len, struct list_head *uf,
3199 : bool unlock);
3200 : extern int do_munmap(struct mm_struct *, unsigned long, size_t,
3201 : struct list_head *uf);
3202 : extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
3203 :
3204 : #ifdef CONFIG_MMU
3205 : extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
3206 : unsigned long start, unsigned long end,
3207 : struct list_head *uf, bool unlock);
3208 : extern int __mm_populate(unsigned long addr, unsigned long len,
3209 : int ignore_errors);
3210 : static inline void mm_populate(unsigned long addr, unsigned long len)
3211 : {
3212 : /* Ignore errors */
3213 : (void) __mm_populate(addr, len, 1);
3214 : }
3215 : #else
3216 : static inline void mm_populate(unsigned long addr, unsigned long len) {}
3217 : #endif
3218 :
3219 : /* These take the mm semaphore themselves */
3220 : extern int __must_check vm_brk(unsigned long, unsigned long);
3221 : extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
3222 : extern int vm_munmap(unsigned long, size_t);
3223 : extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
3224 : unsigned long, unsigned long,
3225 : unsigned long, unsigned long);
3226 :
3227 : struct vm_unmapped_area_info {
3228 : #define VM_UNMAPPED_AREA_TOPDOWN 1
3229 : unsigned long flags;
3230 : unsigned long length;
3231 : unsigned long low_limit;
3232 : unsigned long high_limit;
3233 : unsigned long align_mask;
3234 : unsigned long align_offset;
3235 : };
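     : /*
     :  * Illustrative sketch: an arch_get_unmapped_area()-style helper usually
     :  * fills a request on the stack and hands it to vm_unmapped_area(); flags
     :  * of 0 asks for a bottom-up search, and on failure the return value is a
     :  * negative errno cast to unsigned long (never page aligned).  The limits
     :  * below are placeholders, not values this header mandates.
     :  *
     :  *	struct vm_unmapped_area_info info = {
     :  *		.length		= len,
     :  *		.low_limit	= mm->mmap_base,
     :  *		.high_limit	= TASK_SIZE,
     :  *	};
     :  *	unsigned long addr = vm_unmapped_area(&info);
     :  *
     :  *	if (offset_in_page(addr))
     :  *		return addr;
     :  */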
3236 :
3237 : extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
3238 :
3239 : /* truncate.c */
3240 : extern void truncate_inode_pages(struct address_space *, loff_t);
3241 : extern void truncate_inode_pages_range(struct address_space *,
3242 : loff_t lstart, loff_t lend);
3243 : extern void truncate_inode_pages_final(struct address_space *);
3244 :
3245 : /* generic vm_area_ops exported for stackable file systems */
3246 : extern vm_fault_t filemap_fault(struct vm_fault *vmf);
3247 : extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
3248 : pgoff_t start_pgoff, pgoff_t end_pgoff);
3249 : extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
3250 :
3251 : extern unsigned long stack_guard_gap;
3252 : /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
3253 : int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
3254 : struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr);
3255 :
3256 : /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
3257 : int expand_downwards(struct vm_area_struct *vma, unsigned long address);
3258 :
3259 : /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
3260 : extern struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr);
3261 : extern struct vm_area_struct *find_vma_prev(struct mm_struct *mm, unsigned long addr,
3262 : struct vm_area_struct **pprev);
3263 :
3264 : /*
3265 : * Look up the first VMA which intersects the interval [start_addr, end_addr),
3266 : * or NULL if none.  Assumes start_addr < end_addr.
3267 : */
3268 : struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
3269 : unsigned long start_addr, unsigned long end_addr);
3270 :
3271 : /**
3272 : * vma_lookup() - Find a VMA at a specific address
3273 : * @mm: The process address space.
3274 : * @addr: The user address.
3275 : *
3276 : * Return: The vm_area_struct at the given address, %NULL otherwise.
3277 : */
3278 : static inline
3279 : struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
3280 : {
3281 : return mtree_load(&mm->mm_mt, addr);
3282 : }
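     : /*
     :  * Illustrative sketch: the returned VMA is only stable for as long as the
     :  * caller keeps it from going away, typically by holding the mmap lock
     :  * across both the lookup and the use of the result:
     :  *
     :  *	mmap_read_lock(mm);
     :  *	vma = vma_lookup(mm, addr);
     :  *	if (vma)
     :  *		...
     :  *	mmap_read_unlock(mm);
     :  */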
3283 :
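     : /*
     :  * vm_start_gap()/vm_end_gap() return the VMA bounds with the stack guard
     :  * gap applied: a VM_GROWSDOWN stack reserves stack_guard_gap bytes below
     :  * vm_start, a VM_GROWSUP stack reserves them above vm_end, and either
     :  * adjustment is clamped rather than allowed to wrap the address space.
     :  */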
3284 : static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
3285 : {
3286 : unsigned long vm_start = vma->vm_start;
3287 :
3288 : if (vma->vm_flags & VM_GROWSDOWN) {
3289 : vm_start -= stack_guard_gap;
3290 : if (vm_start > vma->vm_start)
3291 : vm_start = 0;
3292 : }
3293 : return vm_start;
3294 : }
3295 :
3296 : static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
3297 : {
3298 : unsigned long vm_end = vma->vm_end;
3299 :
3300 : if (vma->vm_flags & VM_GROWSUP) {
3301 : vm_end += stack_guard_gap;
3302 : if (vm_end < vma->vm_end)
3303 : vm_end = -PAGE_SIZE;
3304 : }
3305 : return vm_end;
3306 : }
3307 :
3308 : static inline unsigned long vma_pages(struct vm_area_struct *vma)
3309 : {
3310 48027910 : return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3311 : }
3312 :
3313 : /* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
3314 : static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
3315 : unsigned long vm_start, unsigned long vm_end)
3316 : {
3317 : struct vm_area_struct *vma = vma_lookup(mm, vm_start);
3318 :
3319 : if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
3320 : vma = NULL;
3321 :
3322 : return vma;
3323 : }
3324 :
3325 : static inline bool range_in_vma(struct vm_area_struct *vma,
3326 : unsigned long start, unsigned long end)
3327 : {
3328 : return (vma && vma->vm_start <= start && end <= vma->vm_end);
3329 : }
3330 :
3331 : #ifdef CONFIG_MMU
3332 : pgprot_t vm_get_page_prot(unsigned long vm_flags);
3333 : void vma_set_page_prot(struct vm_area_struct *vma);
3334 : #else
3335 : static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
3336 : {
3337 : return __pgprot(0);
3338 : }
3339 : static inline void vma_set_page_prot(struct vm_area_struct *vma)
3340 : {
3341 : vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3342 : }
3343 : #endif
3344 :
3345 : void vma_set_file(struct vm_area_struct *vma, struct file *file);
3346 :
3347 : #ifdef CONFIG_NUMA_BALANCING
3348 : unsigned long change_prot_numa(struct vm_area_struct *vma,
3349 : unsigned long start, unsigned long end);
3350 : #endif
3351 :
3352 : struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
3353 : unsigned long addr);
3354 : int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
3355 : unsigned long pfn, unsigned long size, pgprot_t);
3356 : int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
3357 : unsigned long pfn, unsigned long size, pgprot_t prot);
3358 : int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
3359 : int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
3360 : struct page **pages, unsigned long *num);
3361 : int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
3362 : unsigned long num);
3363 : int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
3364 : unsigned long num);
3365 : vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
3366 : unsigned long pfn);
3367 : vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
3368 : unsigned long pfn, pgprot_t pgprot);
3369 : vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
3370 : pfn_t pfn);
3371 : vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
3372 : unsigned long addr, pfn_t pfn);
3373 : int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
3374 :
3375 : static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
3376 : unsigned long addr, struct page *page)
3377 : {
3378 : int err = vm_insert_page(vma, addr, page);
3379 :
3380 : if (err == -ENOMEM)
3381 : return VM_FAULT_OOM;
3382 : if (err < 0 && err != -EBUSY)
3383 : return VM_FAULT_SIGBUS;
3384 :
3385 : return VM_FAULT_NOPAGE;
3386 : }
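     : /*
     :  * Illustrative sketch: a driver ->fault handler can hand one of its own
     :  * pages to vmf_insert_page() and return the result directly, since the
     :  * helper already maps errnos onto VM_FAULT_* codes.  my_fault() and
     :  * my_lookup_page() are made-up names.
     :  *
     :  *	static vm_fault_t my_fault(struct vm_fault *vmf)
     :  *	{
     :  *		struct page *page = my_lookup_page(vmf->pgoff);
     :  *
     :  *		if (!page)
     :  *			return VM_FAULT_SIGBUS;
     :  *		return vmf_insert_page(vmf->vma, vmf->address, page);
     :  *	}
     :  */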
3387 :
3388 : #ifndef io_remap_pfn_range
3389 : static inline int io_remap_pfn_range(struct vm_area_struct *vma,
3390 : unsigned long addr, unsigned long pfn,
3391 : unsigned long size, pgprot_t prot)
3392 : {
3393 : return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
3394 : }
3395 : #endif
3396 :
3397 : static inline vm_fault_t vmf_error(int err)
3398 : {
3399 0 : if (err == -ENOMEM)
3400 : return VM_FAULT_OOM;
3401 0 : else if (err == -EHWPOISON)
3402 0 : return VM_FAULT_HWPOISON;
3403 : return VM_FAULT_SIGBUS;
3404 : }
3405 :
3406 : struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
3407 : unsigned int foll_flags);
3408 :
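     : /*
     :  * Convert a VM_FAULT_* result (e.g. from handle_mm_fault()) into an errno
     :  * for GUP-style callers; FOLL_HWPOISON in @foll_flags selects -EHWPOISON
     :  * instead of -EFAULT for poisoned pages.  0 means the fault was handled.
     :  */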
3409 : static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
3410 : {
3411 : if (vm_fault & VM_FAULT_OOM)
3412 : return -ENOMEM;
3413 : if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
3414 : return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
3415 : if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
3416 : return -EFAULT;
3417 : return 0;
3418 : }
3419 :
3420 : /*
3421 : * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
3422 : * a (NUMA hinting) fault is required.
3423 : */
3424 : static inline bool gup_can_follow_protnone(unsigned int flags)
3425 : {
3426 : /*
3427 : * FOLL_FORCE has to be able to make progress even if the VMA is
3428 : * inaccessible. Further, FOLL_FORCE access usually does not represent
3429 : * application behaviour and we should avoid triggering NUMA hinting
3430 : * faults.
3431 : */
3432 : return flags & FOLL_FORCE;
3433 : }
3434 :
3435 : typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
3436 : extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
3437 : unsigned long size, pte_fn_t fn, void *data);
3438 : extern int apply_to_existing_page_range(struct mm_struct *mm,
3439 : unsigned long address, unsigned long size,
3440 : pte_fn_t fn, void *data);
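     : /*
     :  * Illustrative sketch: a pte_fn_t callback is invoked once per PTE slot in
     :  * the walked range, with @data passed through unchanged; returning non-zero
     :  * aborts the walk.  count_present() is a made-up example that simply counts
     :  * present entries in a kernel-address range.
     :  *
     :  *	static int count_present(pte_t *pte, unsigned long addr, void *data)
     :  *	{
     :  *		unsigned long *count = data;
     :  *
     :  *		if (pte_present(ptep_get(pte)))
     :  *			(*count)++;
     :  *		return 0;
     :  *	}
     :  *
     :  *	err = apply_to_existing_page_range(&init_mm, addr, size,
     :  *					   count_present, &count);
     :  */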
3441 :
3442 : #ifdef CONFIG_PAGE_POISONING
3443 : extern void __kernel_poison_pages(struct page *page, int numpages);
3444 : extern void __kernel_unpoison_pages(struct page *page, int numpages);
3445 : extern bool _page_poisoning_enabled_early;
3446 : DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
3447 : static inline bool page_poisoning_enabled(void)
3448 : {
3449 : return _page_poisoning_enabled_early;
3450 : }
3451 : /*
3452 : * For use in fast paths after init_mem_debugging() has run, or in places
3453 : * where a false-negative result is harmless if this is called too early.
3454 : */
3455 : static inline bool page_poisoning_enabled_static(void)
3456 : {
3457 : return static_branch_unlikely(&_page_poisoning_enabled);
3458 : }
3459 : static inline void kernel_poison_pages(struct page *page, int numpages)
3460 : {
3461 : if (page_poisoning_enabled_static())
3462 : __kernel_poison_pages(page, numpages);
3463 : }
3464 : static inline void kernel_unpoison_pages(struct page *page, int numpages)
3465 : {
3466 : if (page_poisoning_enabled_static())
3467 : __kernel_unpoison_pages(page, numpages);
3468 : }
3469 : #else
3470 : static inline bool page_poisoning_enabled(void) { return false; }
3471 : static inline bool page_poisoning_enabled_static(void) { return false; }
3472 : static inline void __kernel_poison_pages(struct page *page, int numpages) { }
3473 : static inline void kernel_poison_pages(struct page *page, int numpages) { }
3474 : static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
3475 : #endif
3476 :
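     : /*
     :  * want_init_on_alloc()/want_init_on_free() gate the init_on_alloc= and
     :  * init_on_free= hardening options: when enabled, pages (and slab objects)
     :  * are zeroed on allocation or on free, regardless of __GFP_ZERO.
     :  */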
3477 : DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
3478 : static inline bool want_init_on_alloc(gfp_t flags)
3479 : {
3480 : if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
3481 : &init_on_alloc))
3482 : return true;
3483 : return flags & __GFP_ZERO;
3484 : }
3485 :
3486 : DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
3487 : static inline bool want_init_on_free(void)
3488 : {
3489 : return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
3490 : &init_on_free);
3491 : }
3492 :
3493 : extern bool _debug_pagealloc_enabled_early;
3494 : DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
3495 :
3496 : static inline bool debug_pagealloc_enabled(void)
3497 : {
3498 : return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
3499 : _debug_pagealloc_enabled_early;
3500 : }
3501 :
3502 : /*
3503 : * For use in fast paths after init_debug_pagealloc() has run, or in places
3504 : * where a false-negative result is harmless if this is called too early.
3505 : */
3506 : static inline bool debug_pagealloc_enabled_static(void)
3507 : {
3508 : if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
3509 : return false;
3510 :
3511 : return static_branch_unlikely(&_debug_pagealloc_enabled);
3512 : }
3513 :
3514 : /*
3515 : * To support DEBUG_PAGEALLOC, the architecture must ensure that
3516 : * __kernel_map_pages() never fails.
3517 : */
3518 : extern void __kernel_map_pages(struct page *page, int numpages, int enable);
3519 : #ifdef CONFIG_DEBUG_PAGEALLOC
3520 : static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
3521 : {
3522 : if (debug_pagealloc_enabled_static())
3523 : __kernel_map_pages(page, numpages, 1);
3524 : }
3525 :
3526 : static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
3527 : {
3528 : if (debug_pagealloc_enabled_static())
3529 : __kernel_map_pages(page, numpages, 0);
3530 : }
3531 :
3532 : extern unsigned int _debug_guardpage_minorder;
3533 : DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
3534 :
3535 : static inline unsigned int debug_guardpage_minorder(void)
3536 : {
3537 : return _debug_guardpage_minorder;
3538 : }
3539 :
3540 : static inline bool debug_guardpage_enabled(void)
3541 : {
3542 : return static_branch_unlikely(&_debug_guardpage_enabled);
3543 : }
3544 :
3545 : static inline bool page_is_guard(struct page *page)
3546 : {
3547 : if (!debug_guardpage_enabled())
3548 : return false;
3549 :
3550 : return PageGuard(page);
3551 : }
3552 :
3553 : bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
3554 : int migratetype);
3555 : static inline bool set_page_guard(struct zone *zone, struct page *page,
3556 : unsigned int order, int migratetype)
3557 : {
3558 : if (!debug_guardpage_enabled())
3559 : return false;
3560 : return __set_page_guard(zone, page, order, migratetype);
3561 : }
3562 :
3563 : void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
3564 : int migratetype);
3565 : static inline void clear_page_guard(struct zone *zone, struct page *page,
3566 : unsigned int order, int migratetype)
3567 : {
3568 : if (!debug_guardpage_enabled())
3569 : return;
3570 : __clear_page_guard(zone, page, order, migratetype);
3571 : }
3572 :
3573 : #else /* CONFIG_DEBUG_PAGEALLOC */
3574 : static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
3575 : static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
3576 : static inline unsigned int debug_guardpage_minorder(void) { return 0; }
3577 : static inline bool debug_guardpage_enabled(void) { return false; }
3578 : static inline bool page_is_guard(struct page *page) { return false; }
3579 : static inline bool set_page_guard(struct zone *zone, struct page *page,
3580 : unsigned int order, int migratetype) { return false; }
3581 : static inline void clear_page_guard(struct zone *zone, struct page *page,
3582 : unsigned int order, int migratetype) {}
3583 : #endif /* CONFIG_DEBUG_PAGEALLOC */
3584 :
3585 : #ifdef __HAVE_ARCH_GATE_AREA
3586 : extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
3587 : extern int in_gate_area_no_mm(unsigned long addr);
3588 : extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
3589 : #else
3590 : static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3591 : {
3592 : return NULL;
3593 : }
3594 : static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
3595 : static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
3596 : {
3597 : return 0;
3598 : }
3599 : #endif /* __HAVE_ARCH_GATE_AREA */
3600 :
3601 : extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
3602 :
3603 : #ifdef CONFIG_SYSCTL
3604 : extern int sysctl_drop_caches;
3605 : int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
3606 : loff_t *);
3607 : #endif
3608 :
3609 : void drop_slab(void);
3610 :
3611 : #ifndef CONFIG_MMU
3612 : #define randomize_va_space 0
3613 : #else
3614 : extern int randomize_va_space;
3615 : #endif
3616 :
3617 : const char *arch_vma_name(struct vm_area_struct *vma);
3618 : #ifdef CONFIG_MMU
3619 : void print_vma_addr(char *prefix, unsigned long rip);
3620 : #else
3621 : static inline void print_vma_addr(char *prefix, unsigned long rip)
3622 : {
3623 : }
3624 : #endif
3625 :
3626 : void *sparse_buffer_alloc(unsigned long size);
3627 : struct page * __populate_section_memmap(unsigned long pfn,
3628 : unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
3629 : struct dev_pagemap *pgmap);
3630 : void pmd_init(void *addr);
3631 : void pud_init(void *addr);
3632 : pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
3633 : p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
3634 : pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
3635 : pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
3636 : pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
3637 : struct vmem_altmap *altmap, struct page *reuse);
3638 : void *vmemmap_alloc_block(unsigned long size, int node);
3639 : struct vmem_altmap;
3640 : void *vmemmap_alloc_block_buf(unsigned long size, int node,
3641 : struct vmem_altmap *altmap);
3642 : void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
3643 : void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
3644 : unsigned long addr, unsigned long next);
3645 : int vmemmap_check_pmd(pmd_t *pmd, int node,
3646 : unsigned long addr, unsigned long next);
3647 : int vmemmap_populate_basepages(unsigned long start, unsigned long end,
3648 : int node, struct vmem_altmap *altmap);
3649 : int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
3650 : int node, struct vmem_altmap *altmap);
3651 : int vmemmap_populate(unsigned long start, unsigned long end, int node,
3652 : struct vmem_altmap *altmap);
3653 : void vmemmap_populate_print_last(void);
3654 : #ifdef CONFIG_MEMORY_HOTPLUG
3655 : void vmemmap_free(unsigned long start, unsigned long end,
3656 : struct vmem_altmap *altmap);
3657 : #endif
3658 :
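     : /*
     :  * vmemmap_can_optimize() reports whether the vmemmap backing a compound
     :  * device pagemap can be optimized (its tail-page memmap deduplicated);
     :  * this needs a power-of-two struct page, a compound devmap and no altmap.
     :  */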
3659 : #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
3660 : static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
3661 : struct dev_pagemap *pgmap)
3662 : {
3663 : return is_power_of_2(sizeof(struct page)) &&
3664 : pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
3665 : }
3666 : #else
3667 : static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
3668 : struct dev_pagemap *pgmap)
3669 : {
3670 : return false;
3671 : }
3672 : #endif
3673 :
3674 : void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
3675 : unsigned long nr_pages);
3676 :
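     : /*
     :  * Flags accepted by memory_failure() and related entry points, describing
     :  * how a poisoned page was reported and how its handling may be adjusted.
     :  */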
3677 : enum mf_flags {
3678 : MF_COUNT_INCREASED = 1 << 0,
3679 : MF_ACTION_REQUIRED = 1 << 1,
3680 : MF_MUST_KILL = 1 << 2,
3681 : MF_SOFT_OFFLINE = 1 << 3,
3682 : MF_UNPOISON = 1 << 4,
3683 : MF_SW_SIMULATED = 1 << 5,
3684 : MF_NO_RETRY = 1 << 6,
3685 : };
3686 : int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
3687 : unsigned long count, int mf_flags);
3688 : extern int memory_failure(unsigned long pfn, int flags);
3689 : extern void memory_failure_queue_kick(int cpu);
3690 : extern int unpoison_memory(unsigned long pfn);
3691 : extern void shake_page(struct page *p);
3692 : extern atomic_long_t num_poisoned_pages __read_mostly;
3693 : extern int soft_offline_page(unsigned long pfn, int flags);
3694 : #ifdef CONFIG_MEMORY_FAILURE
3695 : /*
3696 : * Sysfs entries for memory failure handling statistics.
3697 : */
3698 : extern const struct attribute_group memory_failure_attr_group;
3699 : extern void memory_failure_queue(unsigned long pfn, int flags);
3700 : extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
3701 : bool *migratable_cleared);
3702 : void num_poisoned_pages_inc(unsigned long pfn);
3703 : void num_poisoned_pages_sub(unsigned long pfn, long i);
3704 : struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
3705 : #else
3706 : static inline void memory_failure_queue(unsigned long pfn, int flags)
3707 : {
3708 : }
3709 :
3710 : static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
3711 : bool *migratable_cleared)
3712 : {
3713 : return 0;
3714 : }
3715 :
3716 : static inline void num_poisoned_pages_inc(unsigned long pfn)
3717 : {
3718 : }
3719 :
3720 : static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
3721 : {
3722 : }
3723 : #endif
3724 :
3725 : #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
3726 : void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
3727 : struct vm_area_struct *vma, struct list_head *to_kill,
3728 : unsigned long ksm_addr);
3729 : #endif
3730 :
3731 : #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
3732 : extern void memblk_nr_poison_inc(unsigned long pfn);
3733 : extern void memblk_nr_poison_sub(unsigned long pfn, long i);
3734 : #else
3735 : static inline void memblk_nr_poison_inc(unsigned long pfn)
3736 : {
3737 : }
3738 :
3739 : static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
3740 : {
3741 : }
3742 : #endif
3743 :
3744 : #ifndef arch_memory_failure
3745 : static inline int arch_memory_failure(unsigned long pfn, int flags)
3746 : {
3747 : return -ENXIO;
3748 : }
3749 : #endif
3750 :
3751 : #ifndef arch_is_platform_page
3752 : static inline bool arch_is_platform_page(u64 paddr)
3753 : {
3754 : return false;
3755 : }
3756 : #endif
3757 :
3758 : /*
3759 : * Possible outcomes when handling a memory error on a page.
3760 : */
3761 : enum mf_result {
3762 : MF_IGNORED, /* Error: cannot be handled */
3763 : MF_FAILED, /* Error: handling failed */
3764 : MF_DELAYED, /* Will be handled later */
3765 : MF_RECOVERED, /* Successfully recovered */
3766 : };
3767 :
3768 : enum mf_action_page_type {
3769 : MF_MSG_KERNEL,
3770 : MF_MSG_KERNEL_HIGH_ORDER,
3771 : MF_MSG_SLAB,
3772 : MF_MSG_DIFFERENT_COMPOUND,
3773 : MF_MSG_HUGE,
3774 : MF_MSG_FREE_HUGE,
3775 : MF_MSG_UNMAP_FAILED,
3776 : MF_MSG_DIRTY_SWAPCACHE,
3777 : MF_MSG_CLEAN_SWAPCACHE,
3778 : MF_MSG_DIRTY_MLOCKED_LRU,
3779 : MF_MSG_CLEAN_MLOCKED_LRU,
3780 : MF_MSG_DIRTY_UNEVICTABLE_LRU,
3781 : MF_MSG_CLEAN_UNEVICTABLE_LRU,
3782 : MF_MSG_DIRTY_LRU,
3783 : MF_MSG_CLEAN_LRU,
3784 : MF_MSG_TRUNCATED_LRU,
3785 : MF_MSG_BUDDY,
3786 : MF_MSG_DAX,
3787 : MF_MSG_UNSPLIT_THP,
3788 : MF_MSG_UNKNOWN,
3789 : };
3790 :
3791 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3792 : extern void clear_huge_page(struct page *page,
3793 : unsigned long addr_hint,
3794 : unsigned int pages_per_huge_page);
3795 : int copy_user_large_folio(struct folio *dst, struct folio *src,
3796 : unsigned long addr_hint,
3797 : struct vm_area_struct *vma);
3798 : long copy_folio_from_user(struct folio *dst_folio,
3799 : const void __user *usr_src,
3800 : bool allow_pagefault);
3801 :
3802 : /**
3803 : * vma_is_special_huge - Are transhuge page-table entries considered special?
3804 : * @vma: Pointer to the struct vm_area_struct to consider
3805 : *
3806 : * Whether transhuge page-table entries are considered "special" following
3807 : * the definition in vm_normal_page().
3808 : *
3809 : * Return: true if transhuge page-table entries should be considered special,
3810 : * false otherwise.
3811 : */
3812 : static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
3813 : {
3814 : return vma_is_dax(vma) || (vma->vm_file &&
3815 : (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
3816 : }
3817 :
3818 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
3819 :
3820 : #if MAX_NUMNODES > 1
3821 : void __init setup_nr_node_ids(void);
3822 : #else
3823 : static inline void setup_nr_node_ids(void) {}
3824 : #endif
3825 :
3826 : extern int memcmp_pages(struct page *page1, struct page *page2);
3827 :
3828 : static inline int pages_identical(struct page *page1, struct page *page2)
3829 : {
3830 : return !memcmp_pages(page1, page2);
3831 : }
3832 :
3833 : #ifdef CONFIG_MAPPING_DIRTY_HELPERS
3834 : unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
3835 : pgoff_t first_index, pgoff_t nr,
3836 : pgoff_t bitmap_pgoff,
3837 : unsigned long *bitmap,
3838 : pgoff_t *start,
3839 : pgoff_t *end);
3840 :
3841 : unsigned long wp_shared_mapping_range(struct address_space *mapping,
3842 : pgoff_t first_index, pgoff_t nr);
3843 : #endif
3844 :
3845 : extern int sysctl_nr_trim_pages;
3846 :
3847 : #ifdef CONFIG_PRINTK
3848 : void mem_dump_obj(void *object);
3849 : #else
3850 : static inline void mem_dump_obj(void *object) {}
3851 : #endif
3852 :
3853 : /**
3854 : * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
3855 : * @seals: the seals to check
3856 : * @vma: the vma to operate on
3857 : *
3858 : * Check whether F_SEAL_FUTURE_WRITE is set; if so, check and adjust the
3859 : * vma flags accordingly.  Returns 0 if the check passes, or <0 on error.
3860 : */
3861 : static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
3862 : {
3863 : if (seals & F_SEAL_FUTURE_WRITE) {
3864 : /*
3865 : * New PROT_WRITE and MAP_SHARED mmaps are not allowed when the
3866 : * "future write" seal is active.
3867 : */
3868 : if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
3869 : return -EPERM;
3870 :
3871 : /*
3872 : * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
3873 : * MAP_SHARED and read-only, take care to not allow mprotect to
3874 : * revert protections on such mappings. Do this only for shared
3875 : * mappings. For private mappings, we don't need to mask
3876 : * VM_MAYWRITE as we still want them to be COW-writable.
3877 : */
3878 : if (vma->vm_flags & VM_SHARED)
3879 : vm_flags_clear(vma, VM_MAYWRITE);
3880 : }
3881 :
3882 : return 0;
3883 : }
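     : /*
     :  * Illustrative sketch: an mmap() implementation for a sealable (memfd-style)
     :  * file would typically run this check before completing the mapping;
     :  * my_mmap() and my_get_seals() are made-up names.
     :  *
     :  *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
     :  *	{
     :  *		int ret = seal_check_future_write(my_get_seals(file), vma);
     :  *
     :  *		if (ret)
     :  *			return ret;
     :  *		...
     :  *	}
     :  */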
3884 :
3885 : #ifdef CONFIG_ANON_VMA_NAME
3886 : int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
3887 : unsigned long len_in,
3888 : struct anon_vma_name *anon_name);
3889 : #else
3890 : static inline int
3891 : madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
3892 : unsigned long len_in, struct anon_vma_name *anon_name) {
3893 : return 0;
3894 : }
3895 : #endif
3896 :
3897 : #ifdef CONFIG_UNACCEPTED_MEMORY
3898 :
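     : /*
     :  * Unaccepted memory is memory the firmware hands over to the kernel but
     :  * which a guest (e.g. under confidential-computing schemes) must explicitly
     :  * accept before it can be used.  These helpers query and perform that
     :  * acceptance for a physical address range.
     :  */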
3899 : bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
3900 : void accept_memory(phys_addr_t start, phys_addr_t end);
3901 :
3902 : #else
3903 :
3904 : static inline bool range_contains_unaccepted_memory(phys_addr_t start,
3905 : phys_addr_t end)
3906 : {
3907 : return false;
3908 : }
3909 :
3910 : static inline void accept_memory(phys_addr_t start, phys_addr_t end)
3911 : {
3912 : }
3913 :
3914 : #endif
3915 :
3916 : #endif /* _LINUX_MM_H */