LCOV - code coverage report
Current view: top level - include/linux - rmap.h
Test: fstests of 6.5.0-rc4-xfsa @ Mon Jul 31 20:08:27 PDT 2023
Date: 2023-07-31 20:08:27
Coverage:  Lines: 2 / 2 (100.0 %)    Functions: 1 / 1 (100.0 %)

          Line data    Source code
       1             : /* SPDX-License-Identifier: GPL-2.0 */
       2             : #ifndef _LINUX_RMAP_H
       3             : #define _LINUX_RMAP_H
       4             : /*
       5             :  * Declarations for Reverse Mapping functions in mm/rmap.c
       6             :  */
       7             : 
       8             : #include <linux/list.h>
       9             : #include <linux/slab.h>
      10             : #include <linux/mm.h>
      11             : #include <linux/rwsem.h>
      12             : #include <linux/memcontrol.h>
      13             : #include <linux/highmem.h>
      14             : #include <linux/pagemap.h>
      15             : #include <linux/memremap.h>
      16             : 
      17             : /*
      18             :  * The anon_vma heads a list of private "related" vmas, to scan if
      19             :  * an anonymous page pointing to this anon_vma needs to be unmapped:
      20             :  * the vmas on the list will be related by forking, or by splitting.
      21             :  *
      22             :  * Since vmas come and go as they are split and merged (particularly
      23             :  * in mprotect), the mapping field of an anonymous page cannot point
      24             :  * directly to a vma: instead it points to an anon_vma, on whose list
      25             :  * the related vmas can be easily linked or unlinked.
      26             :  *
      27             :  * After unlinking the last vma on the list, we must garbage collect
      28             :  * the anon_vma object itself: we're guaranteed no page can be
      29             :  * pointing to this anon_vma once its vma list is empty.
      30             :  */
      31             : struct anon_vma {
      32             :         struct anon_vma *root;          /* Root of this anon_vma tree */
      33             :         struct rw_semaphore rwsem;      /* W: modification, R: walking the list */
      34             :         /*
      35             :          * The refcount is taken on an anon_vma when there is no
      36             :          * guarantee that the vma of page tables will exist for
      37             :          * the duration of the operation. A caller that takes
      38             :          * the reference is responsible for clearing up the
      39             :          * anon_vma if they are the last user on release
      40             :          */
      41             :         atomic_t refcount;
      42             : 
      43             :         /*
      44             :          * Count of child anon_vmas. Equal to the count of all anon_vmas that
      45             :          * have ->parent pointing to this one, including itself.
      46             :          *
      47             :          * This counter is used for making decisions about reusing an anon_vma
      48             :          * instead of forking a new one. See the comments in anon_vma_clone().
      49             :          */
      50             :         unsigned long num_children;
      51             :         /* Count of VMAs whose ->anon_vma pointer points to this object. */
      52             :         unsigned long num_active_vmas;
      53             : 
      54             :         struct anon_vma *parent;        /* Parent of this anon_vma */
      55             : 
      56             :         /*
      57             :          * NOTE: the LSB of the rb_root.rb_node is set by
      58             :          * mm_take_all_locks() _after_ taking the above lock. So the
      59             :          * rb_root must only be read/written after taking the above lock
      60             :  * to be sure to see a valid next pointer. The LSB itself
      61             :  * is serialized by a system-wide lock only visible to
      62             :          * mm_take_all_locks() (mm_all_locks_mutex).
      63             :          */
      64             : 
      65             :         /* Interval tree of private "related" vmas */
      66             :         struct rb_root_cached rb_root;
      67             : };
      68             : 
      69             : /*
      70             :  * The copy-on-write semantics of fork mean that an anon_vma
      71             :  * can become associated with multiple processes. Furthermore,
      72             :  * each child process will have its own anon_vma, where new
      73             :  * pages for that process are instantiated.
      74             :  *
      75             :  * This structure allows us to find the anon_vmas associated
      76             :  * with a VMA, or the VMAs associated with an anon_vma.
      77             :  * The "same_vma" list contains the anon_vma_chains linking
      78             :  * all the anon_vmas associated with this VMA.
      79             :  * The "rb" field indexes on an interval tree the anon_vma_chains
      80             :  * which link all the VMAs associated with this anon_vma.
      81             :  */
      82             : struct anon_vma_chain {
      83             :         struct vm_area_struct *vma;
      84             :         struct anon_vma *anon_vma;
      85             :         struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
      86             :         struct rb_node rb;                      /* locked by anon_vma->rwsem */
      87             :         unsigned long rb_subtree_last;
      88             : #ifdef CONFIG_DEBUG_VM_RB
      89             :         unsigned long cached_vma_start, cached_vma_last;
      90             : #endif
      91             : };
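
As the comment above describes, the same anon_vma_chain sits on two structures at once: the per-VMA "same_vma" list and the per-anon_vma interval tree. Below is a minimal sketch of walking the per-VMA side, assuming the caller holds mmap_lock; vma->anon_vma_chain is the list head in vm_area_struct, and the function name is hypothetical.

/* Illustrative sketch only, not part of rmap.h. */
static void dump_anon_vmas_of_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        /* vma->anon_vma_chain threads the chains together via their same_vma member. */
        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                pr_info("vma %px -> anon_vma %px\n", avc->vma, avc->anon_vma);
}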
      92             : 
      93             : enum ttu_flags {
      94             :         TTU_SPLIT_HUGE_PMD      = 0x4,  /* split huge PMD if any */
      95             :         TTU_IGNORE_MLOCK        = 0x8,  /* ignore mlock */
      96             :         TTU_SYNC                = 0x10, /* avoid racy checks with PVMW_SYNC */
      97             :         TTU_HWPOISON            = 0x20, /* do convert pte to hwpoison entry */
      98             :         TTU_BATCH_FLUSH         = 0x40, /* Batch TLB flushes where possible
      99             :                                          * and caller guarantees they will
     100             :                                          * do a final flush if necessary */
     101             :         TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
     102             :                                          * caller holds it */
     103             : };
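
These flags form a bitmask that callers combine per call site. A hedged sketch of a reclaim-style caller follows; the helper name is hypothetical, try_to_unmap() is declared further down in this header, and folio_test_pmd_mappable() comes from elsewhere in the tree.

/* Illustrative sketch only. */
static void unmap_for_reclaim(struct folio *folio)
{
        enum ttu_flags flags = TTU_BATCH_FLUSH;

        /* A PMD-mapped THP must be split down to PTEs before unmapping. */
        if (folio_test_pmd_mappable(folio))
                flags |= TTU_SPLIT_HUGE_PMD;

        try_to_unmap(folio, flags);
}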
     104             : 
     105             : #ifdef CONFIG_MMU
     106             : static inline void get_anon_vma(struct anon_vma *anon_vma)
     107             : {
     108             :         atomic_inc(&anon_vma->refcount);
     109             : }
     110             : 
     111             : void __put_anon_vma(struct anon_vma *anon_vma);
     112             : 
     113             : static inline void put_anon_vma(struct anon_vma *anon_vma)
     114             : {
     115             :         if (atomic_dec_and_test(&anon_vma->refcount))
     116             :                 __put_anon_vma(anon_vma);
     117             : }
     118             : 
     119             : static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
     120             : {
     121             :         down_write(&anon_vma->root->rwsem);
     122             : }
     123             : 
     124             : static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
     125             : {
     126             :         up_write(&anon_vma->root->rwsem);
     127             : }
     128             : 
     129             : static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
     130             : {
     131             :         down_read(&anon_vma->root->rwsem);
     132             : }
     133             : 
     134             : static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
     135             : {
     136             :         return down_read_trylock(&anon_vma->root->rwsem);
     137             : }
     138             : 
     139             : static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
     140             : {
     141             :         up_read(&anon_vma->root->rwsem);
     142             : }
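
All of these wrappers operate on anon_vma->root->rwsem, so a single lock covers the whole anon_vma tree. A hedged sketch of a read-side user that pins the anon_vma with the refcount first (the function name is hypothetical; folio_get_anon_vma() is declared further down in this header):

/* Illustrative sketch only. */
static void inspect_folio_anon_vma(struct folio *folio)
{
        struct anon_vma *anon_vma = folio_get_anon_vma(folio);  /* takes a reference */

        if (!anon_vma)
                return;

        anon_vma_lock_read(anon_vma);
        /* ... anon_vma->rb_root cannot change while read-locked ... */
        anon_vma_unlock_read(anon_vma);

        put_anon_vma(anon_vma);
}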
     143             : 
     144             : 
     145             : /*
     146             :  * anon_vma helper functions.
     147             :  */
     148             : void anon_vma_init(void);       /* create anon_vma_cachep */
     149             : int  __anon_vma_prepare(struct vm_area_struct *);
     150             : void unlink_anon_vmas(struct vm_area_struct *);
     151             : int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
     152             : int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
     153             : 
     154             : static inline int anon_vma_prepare(struct vm_area_struct *vma)
     155             : {
     156             :         if (likely(vma->anon_vma))
     157             :                 return 0;
     158             : 
     159             :         return __anon_vma_prepare(vma);
     160             : }
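
A hedged sketch of the usual fault-path call hinted at by the likely() above: make sure vma->anon_vma exists before the first anonymous page is mapped, and treat failure as out-of-memory (the function name is hypothetical):

/* Illustrative sketch only. */
static vm_fault_t hypothetical_anon_fault(struct vm_area_struct *vma)
{
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;

        /* ... allocate a folio and map it with the *_anon_rmap() calls below ... */
        return 0;
}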
     161             : 
     162             : static inline void anon_vma_merge(struct vm_area_struct *vma,
     163             :                                   struct vm_area_struct *next)
     164             : {
     165             :         VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
     166             :         unlink_anon_vmas(next);
     167             : }
     168             : 
     169             : struct anon_vma *folio_get_anon_vma(struct folio *folio);
     170             : 
     171             : /* RMAP flags, currently only relevant for some anon rmap operations. */
     172             : typedef int __bitwise rmap_t;
     173             : 
     174             : /*
     175             :  * No special request: if the page is a subpage of a compound page, it is
     176             :  * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
     177             :  */
     178             : #define RMAP_NONE               ((__force rmap_t)0)
     179             : 
     180             : /* The (sub)page is exclusive to a single process. */
     181             : #define RMAP_EXCLUSIVE          ((__force rmap_t)BIT(0))
     182             : 
     183             : /*
     184             :  * The compound page is not mapped via PTEs, but instead via a single PMD and
     185             :  * should be accounted accordingly.
     186             :  */
     187             : #define RMAP_COMPOUND           ((__force rmap_t)BIT(1))
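
A hedged illustration of how these bits combine when passed to the anon rmap calls prototyped just below; the surrounding page-table work is elided and the function name is hypothetical.

/* Illustrative sketch only: account a PMD-mapped THP that is exclusive to this process. */
static void account_exclusive_thp(struct page *head, struct vm_area_struct *vma,
                                  unsigned long haddr)
{
        page_add_anon_rmap(head, vma, haddr, RMAP_COMPOUND | RMAP_EXCLUSIVE);
}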
     188             : 
     189             : /*
     190             :  * rmap interfaces called when adding or removing pte of page
     191             :  */
     192             : void page_move_anon_rmap(struct page *, struct vm_area_struct *);
     193             : void page_add_anon_rmap(struct page *, struct vm_area_struct *,
     194             :                 unsigned long address, rmap_t flags);
     195             : void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
     196             :                 unsigned long address);
     197             : void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
     198             :                 unsigned long address);
     199             : void page_add_file_rmap(struct page *, struct vm_area_struct *,
     200             :                 bool compound);
     201             : void page_remove_rmap(struct page *, struct vm_area_struct *,
     202             :                 bool compound);
     203             : 
     204             : void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
     205             :                 unsigned long address, rmap_t flags);
     206             : void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
     207             :                 unsigned long address);
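
A hedged sketch of how the add/remove calls pair up over the life of a mapping: the folio variant is for a freshly allocated (hence exclusive) anonymous folio, and the teardown side runs under the PT lock with the PTE already cleared. Locking and page-table details are elided and the function names are hypothetical.

/* Illustrative sketches only. */
static void map_new_anon_folio(struct folio *folio, struct vm_area_struct *vma,
                               unsigned long addr)
{
        folio_add_new_anon_rmap(folio, vma, addr);
        /* ... followed by set_pte_at() for the new mapping ... */
}

static void unmap_small_anon_page(struct page *page, struct vm_area_struct *vma)
{
        /* ... PTE already cleared under the PT lock ... */
        page_remove_rmap(page, vma, false);     /* false: not a compound mapping */
}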
     208             : 
     209             : static inline void __page_dup_rmap(struct page *page, bool compound)
     210             : {
     211             :         if (compound) {
     212             :                 struct folio *folio = (struct folio *)page;
     213             : 
     214             :                 VM_BUG_ON_PAGE(compound && !PageHead(page), page);
     215             :                 atomic_inc(&folio->_entire_mapcount);
     216             :         } else {
     217             :                 atomic_inc(&page->_mapcount);
     218             :         }
     219             : }
     220             : 
     221             : static inline void page_dup_file_rmap(struct page *page, bool compound)
     222             : {
     223             :         __page_dup_rmap(page, compound);
     224             : }
     225             : 
     226             : /**
     227             :  * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
     228             :  *                          anonymous page
     229             :  * @page: the page to duplicate the mapping for
     230             :  * @compound: the page is mapped as compound or as a small page
     231             :  * @vma: the source vma
     232             :  *
      233             :  * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
     234             :  *
     235             :  * Duplicating the mapping can only fail if the page may be pinned; device
      236             :  * private pages cannot get pinned and consequently this function cannot fail for them.
     237             :  *
     238             :  * If duplicating the mapping succeeds, the page has to be mapped R/O into
     239             :  * the parent and the child. It must *not* get mapped writable after this call.
     240             :  *
     241             :  * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
     242             :  */
     243             : static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
     244             :                                          struct vm_area_struct *vma)
     245             : {
     246             :         VM_BUG_ON_PAGE(!PageAnon(page), page);
     247             : 
     248             :         /*
     249             :          * No need to check+clear for already shared pages, including KSM
     250             :          * pages.
     251             :          */
     252             :         if (!PageAnonExclusive(page))
     253             :                 goto dup;
     254             : 
     255             :         /*
     256             :          * If this page may have been pinned by the parent process,
     257             :          * don't allow to duplicate the mapping but instead require to e.g.,
     258             :          * copy the page immediately for the child so that we'll always
     259             :          * guarantee the pinned page won't be randomly replaced in the
     260             :          * future on write faults.
     261             :          */
     262             :         if (likely(!is_device_private_page(page) &&
     263             :             unlikely(page_needs_cow_for_dma(vma, page))))
     264             :                 return -EBUSY;
     265             : 
     266             :         ClearPageAnonExclusive(page);
     267             :         /*
     268             :          * It's okay to share the anon page between both processes, mapping
     269             :          * the page R/O into both processes.
     270             :          */
     271             : dup:
     272             :         __page_dup_rmap(page, compound);
     273             :         return 0;
     274             : }
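
A hedged sketch of the fork-time pattern the kerneldoc above describes: if duplication fails because the page may be pinned, the caller has to fall back to copying the page for the child, as the fork path in mm/memory.c does. The function name and return convention here are assumptions.

/* Illustrative sketch only. */
static int dup_one_anon_pte(struct page *page, struct vm_area_struct *src_vma)
{
        if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
                /* Possibly pinned: the caller must copy the page for the child. */
                return -EAGAIN;
        }

        /* Shared now: parent and child must both map the page R/O. */
        return 0;
}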
     275             : 
     276             : /**
     277             :  * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
     278             :  *                            shared to prepare for KSM or temporary unmapping
     279             :  * @page: the exclusive anonymous page to try marking possibly shared
     280             :  *
     281             :  * The caller needs to hold the PT lock and has to have the page table entry
     282             :  * cleared/invalidated.
     283             :  *
     284             :  * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
     285             :  * to duplicate a mapping, but instead to prepare for KSM or temporarily
     286             :  * unmapping a page (swap, migration) via page_remove_rmap().
     287             :  *
     288             :  * Marking the page shared can only fail if the page may be pinned; device
      289             :  * private pages cannot get pinned and consequently this function cannot fail for them.
     290             :  *
     291             :  * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
     292             :  * otherwise.
     293             :  */
     294             : static inline int page_try_share_anon_rmap(struct page *page)
     295             : {
     296             :         VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
     297             : 
     298             :         /* device private pages cannot get pinned via GUP. */
     299             :         if (unlikely(is_device_private_page(page))) {
     300             :                 ClearPageAnonExclusive(page);
     301             :                 return 0;
     302             :         }
     303             : 
     304             :         /*
     305             :          * We have to make sure that when we clear PageAnonExclusive, that
     306             :          * the page is not pinned and that concurrent GUP-fast won't succeed in
     307             :          * concurrently pinning the page.
     308             :          *
     309             :          * Conceptually, PageAnonExclusive clearing consists of:
     310             :          * (A1) Clear PTE
     311             :          * (A2) Check if the page is pinned; back off if so.
     312             :          * (A3) Clear PageAnonExclusive
     313             :          * (A4) Restore PTE (optional, but certainly not writable)
     314             :          *
     315             :          * When clearing PageAnonExclusive, we cannot possibly map the page
     316             :          * writable again, because anon pages that may be shared must never
     317             :          * be writable. So in any case, if the PTE was writable it cannot
     318             :          * be writable anymore afterwards and there would be a PTE change. Only
     319             :          * if the PTE wasn't writable, there might not be a PTE change.
     320             :          *
     321             :          * Conceptually, GUP-fast pinning of an anon page consists of:
     322             :          * (B1) Read the PTE
     323             :          * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
     324             :          * (B3) Pin the mapped page
     325             :          * (B4) Check if the PTE changed by re-reading it; back off if so.
     326             :          * (B5) If the original PTE is not writable, check if
     327             :          *      PageAnonExclusive is not set; back off if so.
     328             :          *
     329             :          * If the PTE was writable, we only have to make sure that GUP-fast
     330             :          * observes a PTE change and properly backs off.
     331             :          *
     332             :          * If the PTE was not writable, we have to make sure that GUP-fast either
      333             :  * detects a (temporary) PTE change or sees that PageAnonExclusive has been
      334             :  * cleared, and properly backs off.
     335             :          *
     336             :          * Consequently, when clearing PageAnonExclusive(), we have to make
     337             :          * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
     338             :          * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
     339             :          * and (B5) happen in the right memory order.
     340             :          *
     341             :          * We assume that there might not be a memory barrier after
     342             :          * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
     343             :          * so we use explicit ones here.
     344             :          */
     345             : 
     346             :         /* Paired with the memory barrier in try_grab_folio(). */
     347             :         if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
     348             :                 smp_mb();
     349             : 
     350             :         if (unlikely(page_maybe_dma_pinned(page)))
     351             :                 return -EBUSY;
     352             :         ClearPageAnonExclusive(page);
     353             : 
     354             :         /*
     355             :          * This is conceptually a smp_wmb() paired with the smp_rmb() in
     356             :          * gup_must_unshare().
     357             :          */
     358             :         if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
     359             :                 smp_mb__after_atomic();
     360             :         return 0;
     361             : }
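
A hedged sketch of the unmap-time sequence (A1)-(A4) discussed above: the PTE is cleared first, the share attempt runs with the entry invalidated, and the original PTE is restored if the page turns out to be pinned. The function name is hypothetical; ptep_get_and_clear() and set_pte_at() are the generic page-table helpers, and TLB flushing plus the PT lock are elided.

/* Illustrative sketch only. */
static bool try_make_anon_page_shared(struct page *page, struct vm_area_struct *vma,
                                      unsigned long addr, pte_t *ptep)
{
        pte_t pteval = ptep_get_and_clear(vma->vm_mm, addr, ptep);      /* (A1) */

        if (unlikely(page_try_share_anon_rmap(page))) {                 /* (A2)+(A3) */
                set_pte_at(vma->vm_mm, addr, ptep, pteval);             /* (A4) */
                return false;
        }

        /* ... install the swap/migration entry, then page_remove_rmap() ... */
        return true;
}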
     362             : 
     363             : /*
     364             :  * Called from mm/vmscan.c to handle paging out
     365             :  */
     366             : int folio_referenced(struct folio *, int is_locked,
     367             :                         struct mem_cgroup *memcg, unsigned long *vm_flags);
     368             : 
     369             : void try_to_migrate(struct folio *folio, enum ttu_flags flags);
     370             : void try_to_unmap(struct folio *, enum ttu_flags flags);
     371             : 
     372             : int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
     373             :                                 unsigned long end, struct page **pages,
     374             :                                 void *arg);
     375             : 
     376             : /* Avoid racy checks */
     377             : #define PVMW_SYNC               (1 << 0)
     378             : /* Look for migration entries rather than present PTEs */
     379             : #define PVMW_MIGRATION          (1 << 1)
     380             : 
     381             : struct page_vma_mapped_walk {
     382             :         unsigned long pfn;
     383             :         unsigned long nr_pages;
     384             :         pgoff_t pgoff;
     385             :         struct vm_area_struct *vma;
     386             :         unsigned long address;
     387             :         pmd_t *pmd;
     388             :         pte_t *pte;
     389             :         spinlock_t *ptl;
     390             :         unsigned int flags;
     391             : };
     392             : 
     393             : #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)       \
     394             :         struct page_vma_mapped_walk name = {                            \
     395             :                 .pfn = page_to_pfn(_page),                              \
     396             :                 .nr_pages = compound_nr(_page),                         \
     397             :                 .pgoff = page_to_pgoff(_page),                          \
     398             :                 .vma = _vma,                                            \
     399             :                 .address = _address,                                    \
     400             :                 .flags = _flags,                                        \
     401             :         }
     402             : 
     403             : #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)     \
     404             :         struct page_vma_mapped_walk name = {                            \
     405             :                 .pfn = folio_pfn(_folio),                               \
     406             :                 .nr_pages = folio_nr_pages(_folio),                     \
     407             :                 .pgoff = folio_pgoff(_folio),                           \
     408             :                 .vma = _vma,                                            \
     409             :                 .address = _address,                                    \
     410             :                 .flags = _flags,                                        \
     411             :         }
     412             : 
     413             : static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
     414             : {
      415             :         /* A HugeTLB pte was not mapped with pte_offset_map(), so do not pte_unmap() it. */
     416             :         if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
     417             :                 pte_unmap(pvmw->pte);
     418             :         if (pvmw->ptl)
     419             :                 spin_unlock(pvmw->ptl);
     420             : }
     421             : 
     422             : bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
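
A hedged sketch of the usual loop built from the pieces above: DEFINE_FOLIO_VMA_WALK() fills in the walk state, and page_vma_mapped_walk() returns true once per mapping of the folio in this vma, leaving pvmw.pte/pvmw.pmd set and the PT lock held. The function name is hypothetical.

/* Illustrative sketch only: is the folio mapped anywhere in this vma? */
static bool folio_mapped_in_vma_sketch(struct folio *folio, struct vm_area_struct *vma,
                                       unsigned long address)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);

        while (page_vma_mapped_walk(&pvmw)) {
                /* Found one mapping: release the PT lock/mapping and stop early. */
                page_vma_mapped_walk_done(&pvmw);
                return true;
        }
        return false;
}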
     423             : 
     424             : /*
      425             :  * Used by swapoff to help locate where a page is expected in a vma.
     426             :  */
     427             : unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
     428             : 
     429             : /*
     430             :  * Cleans the PTEs of shared mappings.
      431             :  * (and since clean PTEs should also be read-only, it write-protects them too)
      432             :  *
      433             :  * Returns the number of cleaned PTEs.
     434             :  */
     435             : int folio_mkclean(struct folio *);
     436             : 
     437             : int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
     438             :                       struct vm_area_struct *vma);
     439             : 
     440             : void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
     441             : 
     442             : int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
     443             : 
     444             : /*
     445             :  * rmap_walk_control: To control rmap traversing for specific needs
     446             :  *
     447             :  * arg: passed to rmap_one() and invalid_vma()
     448             :  * try_lock: bail out if the rmap lock is contended
      449             :  * contended: indicates the rmap traversal bailed out due to lock contention
      450             :  * rmap_one: executed on each vma where the folio is mapped
      451             :  * done: checks the traversal termination condition
      452             :  * anon_lock: takes the anon_vma lock in an optimized way rather than the default
      453             :  * invalid_vma: skips vmas that are not of interest
     454             :  */
     455             : struct rmap_walk_control {
     456             :         void *arg;
     457             :         bool try_lock;
     458             :         bool contended;
     459             :         /*
     460             :          * Return false if page table scanning in rmap_walk should be stopped.
     461             :          * Otherwise, return true.
     462             :          */
     463             :         bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
     464             :                                         unsigned long addr, void *arg);
     465             :         int (*done)(struct folio *folio);
     466             :         struct anon_vma *(*anon_lock)(struct folio *folio,
     467             :                                       struct rmap_walk_control *rwc);
     468             :         bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
     469             : };
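
A hedged sketch of wiring up a walk with this control structure: only .arg and .rmap_one are set, the other hooks fall back to their defaults, and the callback returns true to keep walking. rmap_walk() is declared just below; the counting example itself is hypothetical.

/* Illustrative sketch only. */
static bool count_one_mapping(struct folio *folio, struct vm_area_struct *vma,
                              unsigned long addr, void *arg)
{
        (*(unsigned long *)arg)++;
        return true;            /* keep scanning further vmas */
}

static unsigned long count_folio_mappings(struct folio *folio)
{
        unsigned long count = 0;
        struct rmap_walk_control rwc = {
                .arg = &count,
                .rmap_one = count_one_mapping,
        };

        rmap_walk(folio, &rwc);
        return count;
}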
     470             : 
     471             : void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
     472             : void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
     473             : struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
     474             :                                           struct rmap_walk_control *rwc);
     475             : 
     476             : #else   /* !CONFIG_MMU */
     477             : 
     478             : #define anon_vma_init()         do {} while (0)
     479             : #define anon_vma_prepare(vma)   (0)
     480             : #define anon_vma_link(vma)      do {} while (0)
     481             : 
     482             : static inline int folio_referenced(struct folio *folio, int is_locked,
     483             :                                   struct mem_cgroup *memcg,
     484             :                                   unsigned long *vm_flags)
     485             : {
     486             :         *vm_flags = 0;
     487             :         return 0;
     488             : }
     489             : 
     490             : static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
     491             : {
     492             : }
     493             : 
     494             : static inline int folio_mkclean(struct folio *folio)
     495             : {
     496             :         return 0;
     497             : }
     498             : #endif  /* CONFIG_MMU */
     499             : 
     500     4403475 : static inline int page_mkclean(struct page *page)
     501             : {
     502     8806950 :         return folio_mkclean(page_folio(page));
     503             : }
     504             : #endif  /* _LINUX_RMAP_H */

Generated by: LCOV version 1.14