LCOV - code coverage report
Current view: top level - fs - dax.c (source / functions)
Test:         fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023
Date:         2023-07-31 20:08:12
                         Hit    Total  Coverage
Lines:                    11      934     1.2 %
Functions:                 2       52     3.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * fs/dax.c - Direct Access filesystem code
       4             :  * Copyright (c) 2013-2014 Intel Corporation
       5             :  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
       6             :  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
       7             :  */
       8             : 
       9             : #include <linux/atomic.h>
      10             : #include <linux/blkdev.h>
      11             : #include <linux/buffer_head.h>
      12             : #include <linux/dax.h>
      13             : #include <linux/fs.h>
      14             : #include <linux/highmem.h>
      15             : #include <linux/memcontrol.h>
      16             : #include <linux/mm.h>
      17             : #include <linux/mutex.h>
      18             : #include <linux/pagevec.h>
      19             : #include <linux/sched.h>
      20             : #include <linux/sched/signal.h>
      21             : #include <linux/uio.h>
      22             : #include <linux/vmstat.h>
      23             : #include <linux/pfn_t.h>
      24             : #include <linux/sizes.h>
      25             : #include <linux/mmu_notifier.h>
      26             : #include <linux/iomap.h>
      27             : #include <linux/rmap.h>
      28             : #include <asm/pgalloc.h>
      29             : 
      30             : #define CREATE_TRACE_POINTS
      31             : #include <trace/events/fs_dax.h>
      32             : 
      33             : static inline unsigned int pe_order(enum page_entry_size pe_size)
      34             : {
      35           0 :         if (pe_size == PE_SIZE_PTE)
      36             :                 return PAGE_SHIFT - PAGE_SHIFT;
      37           0 :         if (pe_size == PE_SIZE_PMD)
      38             :                 return PMD_SHIFT - PAGE_SHIFT;
      39           0 :         if (pe_size == PE_SIZE_PUD)
      40             :                 return PUD_SHIFT - PAGE_SHIFT;
      41             :         return ~0;
      42             : }
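
For concreteness (an editor's illustration assuming the common x86-64 configuration: 4 KiB pages, PAGE_SHIFT = 12, PMD_SHIFT = 21, PUD_SHIFT = 30), pe_order() maps PE_SIZE_PTE to order 0, PE_SIZE_PMD to order 9 (2^9 = 512 pages = 2 MiB) and PE_SIZE_PUD to order 18 (2^18 pages = 1 GiB); any other size yields ~0.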
      43             : 
      44             : /* We choose 4096 entries - same as per-zone page wait tables */
      45             : #define DAX_WAIT_TABLE_BITS 12
      46             : #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
      47             : 
       48             : /* The 'colour' (i.e. low bits) of a page offset within a PMD.  */
      49             : #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
      50             : #define PG_PMD_NR       (PMD_SIZE >> PAGE_SHIFT)
      51             : 
      52             : /* The order of a PMD entry */
      53             : #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
      54             : 
      55             : static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
      56             : 
      57           0 : static int __init init_dax_wait_table(void)
      58             : {
      59           0 :         int i;
      60             : 
      61           0 :         for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
      62           0 :                 init_waitqueue_head(wait_table + i);
      63           0 :         return 0;
      64             : }
      65             : fs_initcall(init_dax_wait_table);
      66             : 
      67             : /*
      68             :  * DAX pagecache entries use XArray value entries so they can't be mistaken
      69             :  * for pages.  We use one bit for locking, one bit for the entry size (PMD)
      70             :  * and two more to tell us if the entry is a zero page or an empty entry that
      71             :  * is just used for locking.  In total four special bits.
      72             :  *
      73             :  * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
      74             :  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
      75             :  * block allocation.
      76             :  */
      77             : #define DAX_SHIFT       (4)
      78             : #define DAX_LOCKED      (1UL << 0)
      79             : #define DAX_PMD         (1UL << 1)
      80             : #define DAX_ZERO_PAGE   (1UL << 2)
      81             : #define DAX_EMPTY       (1UL << 3)
      82             : 
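
As a worked example of the encoding above (an editor's sketch in plain userspace C; the real helpers below additionally wrap the packed value as an XArray value entry via xa_mk_value()/xa_to_value()), a pfn of 0x1234 stored as a locked PMD entry packs and unpacks like this:

    #include <assert.h>
    #include <stdio.h>

    #define DAX_SHIFT   4
    #define DAX_LOCKED  (1UL << 0)
    #define DAX_PMD     (1UL << 1)

    int main(void)
    {
            unsigned long pfn = 0x1234;
            /* pack: pfn in the high bits, the four special bits below DAX_SHIFT */
            unsigned long entry = (pfn << DAX_SHIFT) | DAX_PMD | DAX_LOCKED;

            assert(entry == 0x12343);
            assert((entry >> DAX_SHIFT) == pfn);    /* what dax_to_pfn() computes */
            assert(entry & DAX_PMD);                /* PMD-sized entry */
            assert(entry & DAX_LOCKED);             /* entry is currently locked */
            printf("entry=%#lx pfn=%#lx\n", entry, entry >> DAX_SHIFT);
            return 0;
    }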
      83             : static unsigned long dax_to_pfn(void *entry)
      84             : {
      85           0 :         return xa_to_value(entry) >> DAX_SHIFT;
      86             : }
      87             : 
      88             : static void *dax_make_entry(pfn_t pfn, unsigned long flags)
      89             : {
      90           0 :         return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
      91             : }
      92             : 
      93             : static bool dax_is_locked(void *entry)
      94             : {
      95           0 :         return xa_to_value(entry) & DAX_LOCKED;
      96             : }
      97             : 
      98             : static unsigned int dax_entry_order(void *entry)
      99             : {
     100           0 :         if (xa_to_value(entry) & DAX_PMD)
     101           0 :                 return PMD_ORDER;
     102             :         return 0;
     103             : }
     104             : 
     105             : static unsigned long dax_is_pmd_entry(void *entry)
     106             : {
     107           0 :         return xa_to_value(entry) & DAX_PMD;
     108             : }
     109             : 
     110             : static bool dax_is_pte_entry(void *entry)
     111             : {
     112           0 :         return !(xa_to_value(entry) & DAX_PMD);
     113             : }
     114             : 
     115             : static int dax_is_zero_entry(void *entry)
     116             : {
     117           0 :         return xa_to_value(entry) & DAX_ZERO_PAGE;
     118             : }
     119             : 
     120             : static int dax_is_empty_entry(void *entry)
     121             : {
     122           0 :         return xa_to_value(entry) & DAX_EMPTY;
     123             : }
     124             : 
     125             : /*
     126             :  * true if the entry that was found is of a smaller order than the entry
     127             :  * we were looking for
     128             :  */
     129             : static bool dax_is_conflict(void *entry)
     130             : {
     131             :         return entry == XA_RETRY_ENTRY;
     132             : }
     133             : 
     134             : /*
     135             :  * DAX page cache entry locking
     136             :  */
     137             : struct exceptional_entry_key {
     138             :         struct xarray *xa;
     139             :         pgoff_t entry_start;
     140             : };
     141             : 
     142             : struct wait_exceptional_entry_queue {
     143             :         wait_queue_entry_t wait;
     144             :         struct exceptional_entry_key key;
     145             : };
     146             : 
     147             : /**
     148             :  * enum dax_wake_mode: waitqueue wakeup behaviour
     149             :  * @WAKE_ALL: wake all waiters in the waitqueue
     150             :  * @WAKE_NEXT: wake only the first waiter in the waitqueue
     151             :  */
     152             : enum dax_wake_mode {
     153             :         WAKE_ALL,
     154             :         WAKE_NEXT,
     155             : };
     156             : 
     157           0 : static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
     158             :                 void *entry, struct exceptional_entry_key *key)
     159             : {
     160           0 :         unsigned long hash;
     161           0 :         unsigned long index = xas->xa_index;
     162             : 
     163             :         /*
     164             :          * If 'entry' is a PMD, align the 'index' that we use for the wait
     165             :          * queue to the start of that PMD.  This ensures that all offsets in
     166             :          * the range covered by the PMD map to the same bit lock.
     167             :          */
     168           0 :         if (dax_is_pmd_entry(entry))
     169           0 :                 index &= ~PG_PMD_COLOUR;
     170           0 :         key->xa = xas->xa;
     171           0 :         key->entry_start = index;
     172             : 
     173           0 :         hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
     174           0 :         return wait_table + hash;
     175             : }
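
A short worked example (editor's illustration, assuming 4 KiB pages so that PG_PMD_COLOUR is 511): for a PMD entry, page offsets 512..1023 within the same mapping all have their low nine bits masked off, hash with index 512, and therefore land on the same one of the 4096 wait queues, so a waiter on any offset inside that 2 MiB range is woken by a wake-up issued for any other offset in it.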
     176             : 
     177           0 : static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
     178             :                 unsigned int mode, int sync, void *keyp)
     179             : {
     180           0 :         struct exceptional_entry_key *key = keyp;
     181           0 :         struct wait_exceptional_entry_queue *ewait =
     182           0 :                 container_of(wait, struct wait_exceptional_entry_queue, wait);
     183             : 
     184           0 :         if (key->xa != ewait->key.xa ||
     185           0 :             key->entry_start != ewait->key.entry_start)
     186             :                 return 0;
     187           0 :         return autoremove_wake_function(wait, mode, sync, NULL);
     188             : }
     189             : 
     190             : /*
     191             :  * @entry may no longer be the entry at the index in the mapping.
     192             :  * The important information it's conveying is whether the entry at
     193             :  * this index used to be a PMD entry.
     194             :  */
     195           0 : static void dax_wake_entry(struct xa_state *xas, void *entry,
     196             :                            enum dax_wake_mode mode)
     197             : {
     198           0 :         struct exceptional_entry_key key;
     199           0 :         wait_queue_head_t *wq;
     200             : 
     201           0 :         wq = dax_entry_waitqueue(xas, entry, &key);
     202             : 
     203             :         /*
     204             :          * Checking for locked entry and prepare_to_wait_exclusive() happens
     205             :          * under the i_pages lock, ditto for entry handling in our callers.
     206             :          * So at this point all tasks that could have seen our entry locked
     207             :          * must be in the waitqueue and the following check will see them.
     208             :          */
     209           0 :         if (waitqueue_active(wq))
     210           0 :                 __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
     211           0 : }
     212             : 
     213             : /*
     214             :  * Look up entry in page cache, wait for it to become unlocked if it
     215             :  * is a DAX entry and return it.  The caller must subsequently call
     216             :  * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
     217             :  * if it did.  The entry returned may have a larger order than @order.
     218             :  * If @order is larger than the order of the entry found in i_pages, this
     219             :  * function returns a dax_is_conflict entry.
     220             :  *
     221             :  * Must be called with the i_pages lock held.
     222             :  */
     223           0 : static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
     224             : {
     225           0 :         void *entry;
     226           0 :         struct wait_exceptional_entry_queue ewait;
     227           0 :         wait_queue_head_t *wq;
     228             : 
     229           0 :         init_wait(&ewait.wait);
     230           0 :         ewait.wait.func = wake_exceptional_entry_func;
     231             : 
     232           0 :         for (;;) {
     233           0 :                 entry = xas_find_conflict(xas);
     234           0 :                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
     235           0 :                         return entry;
     236           0 :                 if (dax_entry_order(entry) < order)
     237             :                         return XA_RETRY_ENTRY;
     238           0 :                 if (!dax_is_locked(entry))
     239           0 :                         return entry;
     240             : 
     241           0 :                 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
     242           0 :                 prepare_to_wait_exclusive(wq, &ewait.wait,
     243             :                                           TASK_UNINTERRUPTIBLE);
     244           0 :                 xas_unlock_irq(xas);
     245           0 :                 xas_reset(xas);
     246           0 :                 schedule();
     247           0 :                 finish_wait(wq, &ewait.wait);
     248           0 :                 xas_lock_irq(xas);
     249             :         }
     250             : }
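
For example, a caller that passes @order = PMD_ORDER while the index holds a PTE entry (order 0) sees dax_entry_order(entry) < order, so XA_RETRY_ENTRY is returned; the caller recognises this with dax_is_conflict() and, as in grab_mapping_entry() below, falls back to a smaller mapping size.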
     251             : 
     252             : /*
     253             :  * The only thing keeping the address space around is the i_pages lock
      254             :  * (it's cycled in clear_inode() after removing the entries from i_pages).
     255             :  * After we call xas_unlock_irq(), we cannot touch xas->xa.
     256             :  */
     257           0 : static void wait_entry_unlocked(struct xa_state *xas, void *entry)
     258             : {
     259           0 :         struct wait_exceptional_entry_queue ewait;
     260           0 :         wait_queue_head_t *wq;
     261             : 
     262           0 :         init_wait(&ewait.wait);
     263           0 :         ewait.wait.func = wake_exceptional_entry_func;
     264             : 
     265           0 :         wq = dax_entry_waitqueue(xas, entry, &ewait.key);
     266             :         /*
     267             :          * Unlike get_unlocked_entry() there is no guarantee that this
     268             :          * path ever successfully retrieves an unlocked entry before an
     269             :          * inode dies. Perform a non-exclusive wait in case this path
     270             :          * never successfully performs its own wake up.
     271             :          */
     272           0 :         prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
     273           0 :         xas_unlock_irq(xas);
     274           0 :         schedule();
     275           0 :         finish_wait(wq, &ewait.wait);
     276           0 : }
     277             : 
     278           0 : static void put_unlocked_entry(struct xa_state *xas, void *entry,
     279             :                                enum dax_wake_mode mode)
     280             : {
     281           0 :         if (entry && !dax_is_conflict(entry))
     282           0 :                 dax_wake_entry(xas, entry, mode);
     283           0 : }
     284             : 
     285             : /*
     286             :  * We used the xa_state to get the entry, but then we locked the entry and
     287             :  * dropped the xa_lock, so we know the xa_state is stale and must be reset
     288             :  * before use.
     289             :  */
     290           0 : static void dax_unlock_entry(struct xa_state *xas, void *entry)
     291             : {
     292           0 :         void *old;
     293             : 
     294           0 :         BUG_ON(dax_is_locked(entry));
     295           0 :         xas_reset(xas);
     296           0 :         xas_lock_irq(xas);
     297           0 :         old = xas_store(xas, entry);
     298           0 :         xas_unlock_irq(xas);
     299           0 :         BUG_ON(!dax_is_locked(old));
     300           0 :         dax_wake_entry(xas, entry, WAKE_NEXT);
     301           0 : }
     302             : 
     303             : /*
     304             :  * Return: The entry stored at this location before it was locked.
     305             :  */
     306           0 : static void *dax_lock_entry(struct xa_state *xas, void *entry)
     307             : {
     308           0 :         unsigned long v = xa_to_value(entry);
     309           0 :         return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
     310             : }
     311             : 
     312           0 : static unsigned long dax_entry_size(void *entry)
     313             : {
     314           0 :         if (dax_is_zero_entry(entry))
     315             :                 return 0;
     316           0 :         else if (dax_is_empty_entry(entry))
     317             :                 return 0;
     318           0 :         else if (dax_is_pmd_entry(entry))
     319             :                 return PMD_SIZE;
     320             :         else
     321           0 :                 return PAGE_SIZE;
     322             : }
     323             : 
     324             : static unsigned long dax_end_pfn(void *entry)
     325             : {
     326           0 :         return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
     327             : }
     328             : 
     329             : /*
     330             :  * Iterate through all mapped pfns represented by an entry, i.e. skip
     331             :  * 'empty' and 'zero' entries.
     332             :  */
     333             : #define for_each_mapped_pfn(entry, pfn) \
     334             :         for (pfn = dax_to_pfn(entry); \
     335             :                         pfn < dax_end_pfn(entry); pfn++)
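
For instance, with 4 KiB pages (an editor's assumption about the configuration) a PMD entry whose first pfn is P makes this loop visit P, P+1, ..., P+511, whereas zero and empty entries have dax_entry_size() == 0 and the loop body never runs.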
     336             : 
     337             : static inline bool dax_page_is_shared(struct page *page)
     338             : {
     339           0 :         return page->mapping == PAGE_MAPPING_DAX_SHARED;
     340             : }
     341             : 
     342             : /*
      343             :  * Set page->mapping to PAGE_MAPPING_DAX_SHARED and increase the
      344             :  * refcount.
     345             :  */
     346             : static inline void dax_page_share_get(struct page *page)
     347             : {
     348           0 :         if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
     349             :                 /*
      350             :                  * page->share overlays page->index, so reset it if the
      351             :                  * page was already mapped regularly before.
     352             :                  */
     353           0 :                 if (page->mapping)
     354           0 :                         page->share = 1;
     355           0 :                 page->mapping = PAGE_MAPPING_DAX_SHARED;
     356             :         }
     357           0 :         page->share++;
     358           0 : }
     359             : 
     360             : static inline unsigned long dax_page_share_put(struct page *page)
     361             : {
     362           0 :         return --page->share;
     363             : }
     364             : 
     365             : /*
      366             :  * When called from dax_insert_entry(), the shared flag indicates whether
      367             :  * this entry is shared by multiple files.  If so, set page->mapping to
      368             :  * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
     369             :  */
     370           0 : static void dax_associate_entry(void *entry, struct address_space *mapping,
     371             :                 struct vm_area_struct *vma, unsigned long address, bool shared)
     372             : {
     373           0 :         unsigned long size = dax_entry_size(entry), pfn, index;
     374           0 :         int i = 0;
     375             : 
     376           0 :         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
     377             :                 return;
     378             : 
     379           0 :         index = linear_page_index(vma, address & ~(size - 1));
     380           0 :         for_each_mapped_pfn(entry, pfn) {
     381           0 :                 struct page *page = pfn_to_page(pfn);
     382             : 
     383           0 :                 if (shared) {
     384           0 :                         dax_page_share_get(page);
     385             :                 } else {
     386           0 :                         WARN_ON_ONCE(page->mapping);
     387           0 :                         page->mapping = mapping;
     388           0 :                         page->index = index + i++;
     389             :                 }
     390             :         }
     391             : }
     392             : 
     393           0 : static void dax_disassociate_entry(void *entry, struct address_space *mapping,
     394             :                 bool trunc)
     395             : {
     396           0 :         unsigned long pfn;
     397             : 
     398           0 :         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
     399             :                 return;
     400             : 
     401           0 :         for_each_mapped_pfn(entry, pfn) {
     402           0 :                 struct page *page = pfn_to_page(pfn);
     403             : 
     404           0 :                 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
     405           0 :                 if (dax_page_is_shared(page)) {
     406             :                         /* keep the shared flag if this page is still shared */
     407           0 :                         if (dax_page_share_put(page) > 0)
     408           0 :                                 continue;
     409             :                 } else
     410           0 :                         WARN_ON_ONCE(page->mapping && page->mapping != mapping);
     411           0 :                 page->mapping = NULL;
     412           0 :                 page->index = 0;
     413             :         }
     414             : }
     415             : 
     416           0 : static struct page *dax_busy_page(void *entry)
     417             : {
     418           0 :         unsigned long pfn;
     419             : 
     420           0 :         for_each_mapped_pfn(entry, pfn) {
     421           0 :                 struct page *page = pfn_to_page(pfn);
     422             : 
     423           0 :                 if (page_ref_count(page) > 1)
     424           0 :                         return page;
     425             :         }
     426             :         return NULL;
     427             : }
     428             : 
     429             : /*
     430             :  * dax_lock_page - Lock the DAX entry corresponding to a page
     431             :  * @page: The page whose entry we want to lock
     432             :  *
     433             :  * Context: Process context.
     434             :  * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
     435             :  * not be locked.
     436             :  */
     437           0 : dax_entry_t dax_lock_page(struct page *page)
     438             : {
     439           0 :         XA_STATE(xas, NULL, 0);
     440           0 :         void *entry;
     441             : 
     442             :         /* Ensure page->mapping isn't freed while we look at it */
     443           0 :         rcu_read_lock();
     444           0 :         for (;;) {
     445           0 :                 struct address_space *mapping = READ_ONCE(page->mapping);
     446             : 
     447           0 :                 entry = NULL;
     448           0 :                 if (!mapping || !dax_mapping(mapping))
     449             :                         break;
     450             : 
     451             :                 /*
     452             :                  * In the device-dax case there's no need to lock, a
     453             :                  * struct dev_pagemap pin is sufficient to keep the
     454             :                  * inode alive, and we assume we have dev_pagemap pin
     455             :                  * otherwise we would not have a valid pfn_to_page()
     456             :                  * translation.
     457             :                  */
     458           0 :                 entry = (void *)~0UL;
     459           0 :                 if (S_ISCHR(mapping->host->i_mode))
     460             :                         break;
     461             : 
     462           0 :                 xas.xa = &mapping->i_pages;
     463           0 :                 xas_lock_irq(&xas);
     464           0 :                 if (mapping != page->mapping) {
     465           0 :                         xas_unlock_irq(&xas);
     466           0 :                         continue;
     467             :                 }
     468           0 :                 xas_set(&xas, page->index);
     469           0 :                 entry = xas_load(&xas);
     470           0 :                 if (dax_is_locked(entry)) {
     471           0 :                         rcu_read_unlock();
     472           0 :                         wait_entry_unlocked(&xas, entry);
     473           0 :                         rcu_read_lock();
     474           0 :                         continue;
     475             :                 }
     476           0 :                 dax_lock_entry(&xas, entry);
     477           0 :                 xas_unlock_irq(&xas);
     478             :                 break;
     479             :         }
     480           0 :         rcu_read_unlock();
     481           0 :         return (dax_entry_t)entry;
     482             : }
     483             : 
     484           0 : void dax_unlock_page(struct page *page, dax_entry_t cookie)
     485             : {
     486           0 :         struct address_space *mapping = page->mapping;
     487           0 :         XA_STATE(xas, &mapping->i_pages, page->index);
     488             : 
     489           0 :         if (S_ISCHR(mapping->host->i_mode))
     490           0 :                 return;
     491             : 
     492           0 :         dax_unlock_entry(&xas, (void *)cookie);
     493             : }
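
A minimal usage sketch of this pair (editor's illustration; the surrounding kernel context and the -EBUSY policy are assumptions, only dax_lock_page()/dax_unlock_page() come from this file):

    dax_entry_t cookie;

    cookie = dax_lock_page(page);
    if (!cookie)
            return -EBUSY;          /* the DAX entry could not be locked */
    /* ... operate on the page while its entry is held locked ... */
    dax_unlock_page(page, cookie);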
     494             : 
     495             : /*
     496             :  * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
     497             :  * @mapping: the file's mapping whose entry we want to lock
     498             :  * @index: the offset within this file
     499             :  * @page: output the dax page corresponding to this dax entry
     500             :  *
     501             :  * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
     502             :  * could not be locked.
     503             :  */
     504           0 : dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
     505             :                 struct page **page)
     506             : {
     507           0 :         XA_STATE(xas, NULL, 0);
     508           0 :         void *entry;
     509             : 
     510           0 :         rcu_read_lock();
     511           0 :         for (;;) {
     512           0 :                 entry = NULL;
     513           0 :                 if (!dax_mapping(mapping))
     514             :                         break;
     515             : 
     516           0 :                 xas.xa = &mapping->i_pages;
     517           0 :                 xas_lock_irq(&xas);
     518           0 :                 xas_set(&xas, index);
     519           0 :                 entry = xas_load(&xas);
     520           0 :                 if (dax_is_locked(entry)) {
     521           0 :                         rcu_read_unlock();
     522           0 :                         wait_entry_unlocked(&xas, entry);
     523           0 :                         rcu_read_lock();
     524           0 :                         continue;
     525             :                 }
     526           0 :                 if (!entry ||
     527           0 :                     dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
     528             :                         /*
      529             :                          * Because we look the entry up by the file's mapping
      530             :                          * and index, it may not have been inserted yet, or it
      531             :                          * may be a zero/empty entry.  We don't treat this as
      532             :                          * an error case.  So, return a special value and do
      533             :                          * not output @page.
     534             :                          */
     535             :                         entry = (void *)~0UL;
     536             :                 } else {
     537           0 :                         *page = pfn_to_page(dax_to_pfn(entry));
     538           0 :                         dax_lock_entry(&xas, entry);
     539             :                 }
     540           0 :                 xas_unlock_irq(&xas);
     541             :                 break;
     542             :         }
     543           0 :         rcu_read_unlock();
     544           0 :         return (dax_entry_t)entry;
     545             : }
     546             : 
     547           0 : void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
     548             :                 dax_entry_t cookie)
     549             : {
     550           0 :         XA_STATE(xas, &mapping->i_pages, index);
     551             : 
     552           0 :         if (cookie == ~0UL)
     553           0 :                 return;
     554             : 
     555           0 :         dax_unlock_entry(&xas, (void *)cookie);
     556             : }
     557             : 
     558             : /*
     559             :  * Find page cache entry at given index. If it is a DAX entry, return it
     560             :  * with the entry locked. If the page cache doesn't contain an entry at
     561             :  * that index, add a locked empty entry.
     562             :  *
     563             :  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
     564             :  * either return that locked entry or will return VM_FAULT_FALLBACK.
     565             :  * This will happen if there are any PTE entries within the PMD range
     566             :  * that we are requesting.
     567             :  *
     568             :  * We always favor PTE entries over PMD entries. There isn't a flow where we
     569             :  * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
     570             :  * insertion will fail if it finds any PTE entries already in the tree, and a
     571             :  * PTE insertion will cause an existing PMD entry to be unmapped and
     572             :  * downgraded to PTE entries.  This happens for both PMD zero pages as
     573             :  * well as PMD empty entries.
     574             :  *
     575             :  * The exception to this downgrade path is for PMD entries that have
     576             :  * real storage backing them.  We will leave these real PMD entries in
     577             :  * the tree, and PTE writes will simply dirty the entire PMD entry.
     578             :  *
     579             :  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
     580             :  * persistent memory the benefit is doubtful. We can add that later if we can
     581             :  * show it helps.
     582             :  *
     583             :  * On error, this function does not return an ERR_PTR.  Instead it returns
     584             :  * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
     585             :  * overlap with xarray value entries.
     586             :  */
     587           0 : static void *grab_mapping_entry(struct xa_state *xas,
     588             :                 struct address_space *mapping, unsigned int order)
     589             : {
     590           0 :         unsigned long index = xas->xa_index;
     591           0 :         bool pmd_downgrade;     /* splitting PMD entry into PTE entries? */
     592           0 :         void *entry;
     593             : 
     594           0 : retry:
     595           0 :         pmd_downgrade = false;
     596           0 :         xas_lock_irq(xas);
     597           0 :         entry = get_unlocked_entry(xas, order);
     598             : 
     599           0 :         if (entry) {
     600           0 :                 if (dax_is_conflict(entry))
     601           0 :                         goto fallback;
     602           0 :                 if (!xa_is_value(entry)) {
     603           0 :                         xas_set_err(xas, -EIO);
     604           0 :                         goto out_unlock;
     605             :                 }
     606             : 
     607           0 :                 if (order == 0) {
     608           0 :                         if (dax_is_pmd_entry(entry) &&
     609           0 :                             (dax_is_zero_entry(entry) ||
     610             :                              dax_is_empty_entry(entry))) {
     611           0 :                                 pmd_downgrade = true;
     612             :                         }
     613             :                 }
     614             :         }
     615             : 
     616           0 :         if (pmd_downgrade) {
     617             :                 /*
     618             :                  * Make sure 'entry' remains valid while we drop
     619             :                  * the i_pages lock.
     620             :                  */
     621           0 :                 dax_lock_entry(xas, entry);
     622             : 
     623             :                 /*
     624             :                  * Besides huge zero pages the only other thing that gets
     625             :                  * downgraded are empty entries which don't need to be
     626             :                  * unmapped.
     627             :                  */
     628           0 :                 if (dax_is_zero_entry(entry)) {
     629           0 :                         xas_unlock_irq(xas);
     630           0 :                         unmap_mapping_pages(mapping,
     631           0 :                                         xas->xa_index & ~PG_PMD_COLOUR,
     632             :                                         PG_PMD_NR, false);
     633           0 :                         xas_reset(xas);
     634           0 :                         xas_lock_irq(xas);
     635             :                 }
     636             : 
     637           0 :                 dax_disassociate_entry(entry, mapping, false);
     638           0 :                 xas_store(xas, NULL);   /* undo the PMD join */
     639           0 :                 dax_wake_entry(xas, entry, WAKE_ALL);
     640           0 :                 mapping->nrpages -= PG_PMD_NR;
     641           0 :                 entry = NULL;
     642           0 :                 xas_set(xas, index);
     643             :         }
     644             : 
     645           0 :         if (entry) {
     646           0 :                 dax_lock_entry(xas, entry);
     647             :         } else {
     648           0 :                 unsigned long flags = DAX_EMPTY;
     649             : 
     650           0 :                 if (order > 0)
     651           0 :                         flags |= DAX_PMD;
     652           0 :                 entry = dax_make_entry(pfn_to_pfn_t(0), flags);
     653           0 :                 dax_lock_entry(xas, entry);
     654           0 :                 if (xas_error(xas))
     655           0 :                         goto out_unlock;
     656           0 :                 mapping->nrpages += 1UL << order;
     657             :         }
     658             : 
     659           0 : out_unlock:
     660           0 :         xas_unlock_irq(xas);
     661           0 :         if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
     662           0 :                 goto retry;
     663           0 :         if (xas->xa_node == XA_ERROR(-ENOMEM))
     664             :                 return xa_mk_internal(VM_FAULT_OOM);
     665           0 :         if (xas_error(xas))
     666           0 :                 return xa_mk_internal(VM_FAULT_SIGBUS);
     667             :         return entry;
     668             : fallback:
     669           0 :         xas_unlock_irq(xas);
     670           0 :         return xa_mk_internal(VM_FAULT_FALLBACK);
     671             : }
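
Two concrete outcomes of the policy described above: an order-PMD request that finds an existing PTE entry in its range gets a conflict back from get_unlocked_entry() and returns xa_mk_internal(VM_FAULT_FALLBACK), while an order-0 request that finds a zero-page or empty PMD entry takes the pmd_downgrade path, unmapping the 2 MiB range if it was a zero page and installing a fresh locked empty PTE-sized entry in its place.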
     672             : 
     673             : /**
     674             :  * dax_layout_busy_page_range - find first pinned page in @mapping
     675             :  * @mapping: address space to scan for a page with ref count > 1
     676             :  * @start: Starting offset. Page containing 'start' is included.
     677             :  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
      678             :  *       pages from 'start' to the end of file are included.
     679             :  *
     680             :  * DAX requires ZONE_DEVICE mapped pages. These pages are never
     681             :  * 'onlined' to the page allocator so they are considered idle when
     682             :  * page->count == 1. A filesystem uses this interface to determine if
     683             :  * any page in the mapping is busy, i.e. for DMA, or other
     684             :  * get_user_pages() usages.
     685             :  *
     686             :  * It is expected that the filesystem is holding locks to block the
     687             :  * establishment of new mappings in this address_space. I.e. it expects
     688             :  * to be able to run unmap_mapping_range() and subsequently not race
     689             :  * mapping_mapped() becoming true.
     690             :  */
     691    66390389 : struct page *dax_layout_busy_page_range(struct address_space *mapping,
     692             :                                         loff_t start, loff_t end)
     693             : {
     694    66390389 :         void *entry;
     695    66390389 :         unsigned int scanned = 0;
     696    66390389 :         struct page *page = NULL;
     697    66390389 :         pgoff_t start_idx = start >> PAGE_SHIFT;
     698    66390389 :         pgoff_t end_idx;
     699    66390389 :         XA_STATE(xas, &mapping->i_pages, start_idx);
     700             : 
     701             :         /*
     702             :          * In the 'limited' case get_user_pages() for dax is disabled.
     703             :          */
     704    66390389 :         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
     705             :                 return NULL;
     706             : 
     707   132780778 :         if (!dax_mapping(mapping) || !mapping_mapped(mapping))
     708             :                 return NULL;
     709             : 
     710             :         /* If end == LLONG_MAX, all pages from start to till end of file */
     711           0 :         if (end == LLONG_MAX)
     712             :                 end_idx = ULONG_MAX;
     713             :         else
     714           0 :                 end_idx = end >> PAGE_SHIFT;
     715             :         /*
     716             :          * If we race get_user_pages_fast() here either we'll see the
     717             :          * elevated page count in the iteration and wait, or
     718             :          * get_user_pages_fast() will see that the page it took a reference
     719             :          * against is no longer mapped in the page tables and bail to the
     720             :          * get_user_pages() slow path.  The slow path is protected by
     721             :          * pte_lock() and pmd_lock(). New references are not taken without
     722             :          * holding those locks, and unmap_mapping_pages() will not zero the
     723             :          * pte or pmd without holding the respective lock, so we are
     724             :          * guaranteed to either see new references or prevent new
     725             :          * references from being established.
     726             :          */
     727           0 :         unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
     728             : 
     729           0 :         xas_lock_irq(&xas);
     730           0 :         xas_for_each(&xas, entry, end_idx) {
     731           0 :                 if (WARN_ON_ONCE(!xa_is_value(entry)))
     732           0 :                         continue;
     733           0 :                 if (unlikely(dax_is_locked(entry)))
     734           0 :                         entry = get_unlocked_entry(&xas, 0);
     735           0 :                 if (entry)
     736           0 :                         page = dax_busy_page(entry);
     737           0 :                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
     738           0 :                 if (page)
     739             :                         break;
     740           0 :                 if (++scanned % XA_CHECK_SCHED)
     741           0 :                         continue;
     742             : 
     743           0 :                 xas_pause(&xas);
     744           0 :                 xas_unlock_irq(&xas);
     745           0 :                 cond_resched();
     746           0 :                 xas_lock_irq(&xas);
     747             :         }
     748           0 :         xas_unlock_irq(&xas);
     749           0 :         return page;
     750             : }
     751             : EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
     752             : 
     753    66391302 : struct page *dax_layout_busy_page(struct address_space *mapping)
     754             : {
     755    66391302 :         return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
     756             : }
     757             : EXPORT_SYMBOL_GPL(dax_layout_busy_page);
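
A hedged sketch of the expected calling pattern (editor's illustration; wait_for_page_idle() is a hypothetical placeholder for filesystem-specific code, and the caller is assumed to hold the locks described in the kernel-doc above that block new mappings):

    struct page *page;

    while ((page = dax_layout_busy_page(inode->i_mapping)) != NULL) {
            /*
             * Placeholder: wait for the extra reference (e.g. a DMA pin
             * taken via get_user_pages()) to be dropped, then rescan.
             */
            wait_for_page_idle(page);       /* hypothetical helper */
    }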
     758             : 
     759           0 : static int __dax_invalidate_entry(struct address_space *mapping,
     760             :                                           pgoff_t index, bool trunc)
     761             : {
     762           0 :         XA_STATE(xas, &mapping->i_pages, index);
     763           0 :         int ret = 0;
     764           0 :         void *entry;
     765             : 
     766           0 :         xas_lock_irq(&xas);
     767           0 :         entry = get_unlocked_entry(&xas, 0);
     768           0 :         if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
     769           0 :                 goto out;
     770           0 :         if (!trunc &&
     771           0 :             (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
     772           0 :              xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
     773           0 :                 goto out;
     774           0 :         dax_disassociate_entry(entry, mapping, trunc);
     775           0 :         xas_store(&xas, NULL);
     776           0 :         mapping->nrpages -= 1UL << dax_entry_order(entry);
     777           0 :         ret = 1;
     778           0 : out:
     779           0 :         put_unlocked_entry(&xas, entry, WAKE_ALL);
     780           0 :         xas_unlock_irq(&xas);
     781           0 :         return ret;
     782             : }
     783             : 
     784           0 : static int __dax_clear_dirty_range(struct address_space *mapping,
     785             :                 pgoff_t start, pgoff_t end)
     786             : {
     787           0 :         XA_STATE(xas, &mapping->i_pages, start);
     788           0 :         unsigned int scanned = 0;
     789           0 :         void *entry;
     790             : 
     791           0 :         xas_lock_irq(&xas);
     792           0 :         xas_for_each(&xas, entry, end) {
     793           0 :                 entry = get_unlocked_entry(&xas, 0);
     794           0 :                 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
     795           0 :                 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
     796           0 :                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
     797             : 
     798           0 :                 if (++scanned % XA_CHECK_SCHED)
     799           0 :                         continue;
     800             : 
     801           0 :                 xas_pause(&xas);
     802           0 :                 xas_unlock_irq(&xas);
     803           0 :                 cond_resched();
     804           0 :                 xas_lock_irq(&xas);
     805             :         }
     806           0 :         xas_unlock_irq(&xas);
     807             : 
     808           0 :         return 0;
     809             : }
     810             : 
     811             : /*
     812             :  * Delete DAX entry at @index from @mapping.  Wait for it
     813             :  * to be unlocked before deleting it.
     814             :  */
     815           0 : int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
     816             : {
     817           0 :         int ret = __dax_invalidate_entry(mapping, index, true);
     818             : 
     819             :         /*
     820             :          * This gets called from truncate / punch_hole path. As such, the caller
     821             :          * must hold locks protecting against concurrent modifications of the
     822             :          * page cache (usually fs-private i_mmap_sem for writing). Since the
     823             :          * caller has seen a DAX entry for this index, we better find it
     824             :          * at that index as well...
     825             :          */
     826           0 :         WARN_ON_ONCE(!ret);
     827           0 :         return ret;
     828             : }
     829             : 
     830             : /*
     831             :  * Invalidate DAX entry if it is clean.
     832             :  */
     833           0 : int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
     834             :                                       pgoff_t index)
     835             : {
     836           0 :         return __dax_invalidate_entry(mapping, index, false);
     837             : }
     838             : 
     839             : static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
     840             : {
     841           0 :         return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
     842             : }
     843             : 
     844           0 : static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
     845             : {
     846           0 :         pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
     847           0 :         void *vto, *kaddr;
     848           0 :         long rc;
     849           0 :         int id;
     850             : 
     851           0 :         id = dax_read_lock();
     852           0 :         rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
     853             :                                 &kaddr, NULL);
     854           0 :         if (rc < 0) {
     855           0 :                 dax_read_unlock(id);
     856           0 :                 return rc;
     857             :         }
     858           0 :         vto = kmap_atomic(vmf->cow_page);
     859           0 :         copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
     860           0 :         kunmap_atomic(vto);
     861           0 :         dax_read_unlock(id);
     862           0 :         return 0;
     863             : }
     864             : 
     865             : /*
     866             :  * MAP_SYNC on a dax mapping guarantees dirty metadata is
     867             :  * flushed on write-faults (non-cow), but not read-faults.
     868             :  */
     869           0 : static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
     870             :                 struct vm_area_struct *vma)
     871             : {
     872           0 :         return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
     873           0 :                 (iter->iomap.flags & IOMAP_F_DIRTY);
     874             : }
     875             : 
     876             : /*
     877             :  * By this point grab_mapping_entry() has ensured that we have a locked entry
     878             :  * of the appropriate size so we don't have to worry about downgrading PMDs to
     879             :  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
     880             :  * already in the tree, we will skip the insertion and just dirty the PMD as
     881             :  * appropriate.
     882             :  */
     883           0 : static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
     884             :                 const struct iomap_iter *iter, void *entry, pfn_t pfn,
     885             :                 unsigned long flags)
     886             : {
     887           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
     888           0 :         void *new_entry = dax_make_entry(pfn, flags);
     889           0 :         bool write = iter->flags & IOMAP_WRITE;
     890           0 :         bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
     891           0 :         bool shared = iter->iomap.flags & IOMAP_F_SHARED;
     892             : 
     893           0 :         if (dirty)
     894           0 :                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
     895             : 
     896           0 :         if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
     897           0 :                 unsigned long index = xas->xa_index;
     898             :                 /* we are replacing a zero page with block mapping */
     899           0 :                 if (dax_is_pmd_entry(entry))
     900           0 :                         unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
     901             :                                         PG_PMD_NR, false);
     902             :                 else /* pte entry */
     903           0 :                         unmap_mapping_pages(mapping, index, 1, false);
     904             :         }
     905             : 
     906           0 :         xas_reset(xas);
     907           0 :         xas_lock_irq(xas);
     908           0 :         if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
     909           0 :                 void *old;
     910             : 
     911           0 :                 dax_disassociate_entry(entry, mapping, false);
     912           0 :                 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
     913             :                                 shared);
     914             :                 /*
     915             :                  * Only swap our new entry into the page cache if the current
     916             :                  * entry is a zero page or an empty entry.  If a normal PTE or
     917             :                  * PMD entry is already in the cache, we leave it alone.  This
     918             :                  * means that if we are trying to insert a PTE and the
     919             :                  * existing entry is a PMD, we will just leave the PMD in the
     920             :                  * tree and dirty it if necessary.
     921             :                  */
     922           0 :                 old = dax_lock_entry(xas, new_entry);
     923           0 :                 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
     924             :                                         DAX_LOCKED));
     925             :                 entry = new_entry;
     926             :         } else {
     927           0 :                 xas_load(xas);  /* Walk the xa_state */
     928             :         }
     929             : 
     930           0 :         if (dirty)
     931           0 :                 xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
     932             : 
     933           0 :         if (write && shared)
     934           0 :                 xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
     935             : 
     936           0 :         xas_unlock_irq(xas);
     937           0 :         return entry;
     938             : }
     939             : 
     940           0 : static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
     941             :                 struct address_space *mapping, void *entry)
     942             : {
     943           0 :         unsigned long pfn, index, count, end;
     944           0 :         long ret = 0;
     945           0 :         struct vm_area_struct *vma;
     946             : 
     947             :         /*
     948             :          * A page got tagged dirty in DAX mapping? Something is seriously
     949             :          * wrong.
     950             :          */
     951           0 :         if (WARN_ON(!xa_is_value(entry)))
     952             :                 return -EIO;
     953             : 
     954           0 :         if (unlikely(dax_is_locked(entry))) {
     955           0 :                 void *old_entry = entry;
     956             : 
     957           0 :                 entry = get_unlocked_entry(xas, 0);
     958             : 
     959             :                 /* Entry got punched out / reallocated? */
     960           0 :                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
     961           0 :                         goto put_unlocked;
     962             :                 /*
     963             :                  * Entry got reallocated elsewhere? No need to writeback.
     964             :                  * We have to compare pfns as we must not bail out due to
     965             :                  * difference in lockbit or entry type.
     966             :                  */
     967           0 :                 if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
     968           0 :                         goto put_unlocked;
     969           0 :                 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
     970             :                                         dax_is_zero_entry(entry))) {
     971           0 :                         ret = -EIO;
     972           0 :                         goto put_unlocked;
     973             :                 }
     974             : 
     975             :                 /* Another fsync thread may have already done this entry */
     976           0 :                 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
     977           0 :                         goto put_unlocked;
     978             :         }
     979             : 
     980             :         /* Lock the entry to serialize with page faults */
     981           0 :         dax_lock_entry(xas, entry);
     982             : 
     983             :         /*
     984             :          * We can clear the tag now but we have to be careful so that concurrent
     985             :          * dax_writeback_one() calls for the same index cannot finish before we
     986             :          * actually flush the caches. This is achieved as the calls will look
     987             :          * at the entry only under the i_pages lock and once they do that
     988             :          * they will see the entry locked and wait for it to unlock.
     989             :          */
     990           0 :         xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
     991           0 :         xas_unlock_irq(xas);
     992             : 
     993             :         /*
     994             :          * If dax_writeback_mapping_range() was given a wbc->range_start
     995             :          * in the middle of a PMD, the 'index' we use needs to be
     996             :          * aligned to the start of the PMD.
     997             :          * This allows us to flush for PMD_SIZE and not have to worry about
     998             :          * partial PMD writebacks.
     999             :          */
    1000           0 :         pfn = dax_to_pfn(entry);
    1001           0 :         count = 1UL << dax_entry_order(entry);
    1002           0 :         index = xas->xa_index & ~(count - 1);
    1003           0 :         end = index + count - 1;
    1004             : 
    1005             :         /* Walk all mappings of a given index of a file and writeprotect them */
    1006           0 :         i_mmap_lock_read(mapping);
    1007           0 :         vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
    1008           0 :                 pfn_mkclean_range(pfn, count, index, vma);
    1009           0 :                 cond_resched();
    1010             :         }
    1011           0 :         i_mmap_unlock_read(mapping);
    1012             : 
    1013           0 :         dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
    1014             :         /*
    1015             :          * After we have flushed the cache, we can clear the dirty tag. There
    1016             :          * cannot be new dirty data in the pfn after the flush has completed as
    1017             :          * the pfn mappings are writeprotected and fault waits for mapping
    1018             :          * entry lock.
    1019             :          */
    1020           0 :         xas_reset(xas);
    1021           0 :         xas_lock_irq(xas);
    1022           0 :         xas_store(xas, entry);
    1023           0 :         xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
    1024           0 :         dax_wake_entry(xas, entry, WAKE_NEXT);
    1025             : 
    1026           0 :         trace_dax_writeback_one(mapping->host, index, count);
    1027           0 :         return ret;
    1028             : 
    1029           0 :  put_unlocked:
    1030           0 :         put_unlocked_entry(xas, entry, WAKE_NEXT);
    1031           0 :         return ret;
    1032             : }
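/*
 * Editor's note: a worked example of the PMD alignment performed above,
 * assuming a 4KiB page size with 2MiB PMD entries (order 9, as on x86-64);
 * the concrete numbers are illustrative only.
 *
 *	dax_entry_order(entry) = 9   ->  count = 1UL << 9 = 512
 *	xas->xa_index = 0x205        ->  index = 0x205 & ~(512 - 1) = 0x200
 *	                                 end   = 0x200 + 512 - 1    = 0x3ff
 *
 * so pfn_mkclean_range() and dax_flush() always cover the whole 2MiB entry,
 * even when writeback was asked to start in the middle of it.
 */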
    1033             : 
    1034             : /*
    1035             :  * Flush the mapping to the persistent domain within the byte range of [start,
    1036             :  * end]. This is required by data integrity operations to ensure file data is
    1037             :  * on persistent storage prior to completion of the operation.
    1038             :  */
    1039           0 : int dax_writeback_mapping_range(struct address_space *mapping,
    1040             :                 struct dax_device *dax_dev, struct writeback_control *wbc)
    1041             : {
    1042           0 :         XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
    1043           0 :         struct inode *inode = mapping->host;
    1044           0 :         pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
    1045           0 :         void *entry;
    1046           0 :         int ret = 0;
    1047           0 :         unsigned int scanned = 0;
    1048             : 
    1049           0 :         if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
    1050             :                 return -EIO;
    1051             : 
    1052           0 :         if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
    1053             :                 return 0;
    1054             : 
    1055           0 :         trace_dax_writeback_range(inode, xas.xa_index, end_index);
    1056             : 
    1057           0 :         tag_pages_for_writeback(mapping, xas.xa_index, end_index);
    1058             : 
    1059           0 :         xas_lock_irq(&xas);
    1060           0 :         xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
    1061           0 :                 ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
    1062           0 :                 if (ret < 0) {
    1063           0 :                         mapping_set_error(mapping, ret);
    1064           0 :                         break;
    1065             :                 }
    1066           0 :                 if (++scanned % XA_CHECK_SCHED)
    1067           0 :                         continue;
    1068             : 
    1069           0 :                 xas_pause(&xas);
    1070           0 :                 xas_unlock_irq(&xas);
    1071           0 :                 cond_resched();
    1072           0 :                 xas_lock_irq(&xas);
    1073             :         }
    1074           0 :         xas_unlock_irq(&xas);
    1075           0 :         trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
    1076           0 :         return ret;
    1077             : }
    1078             : EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
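/*
 * Editor's note: a minimal sketch of how a filesystem might drive the helper
 * above from its ->writepages method for a DAX inode. The "myfs_*" names and
 * the location of the dax_device in the per-superblock info are assumptions,
 * not part of this file; real callers such as ext4 and xfs look similar.
 */
#if 0	/* illustration only */
static int myfs_dax_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct myfs_sb_info *sbi = MYFS_SB(mapping->host->i_sb);

	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
#endif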
    1079             : 
    1080           0 : static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
    1081             :                 size_t size, void **kaddr, pfn_t *pfnp)
    1082             : {
    1083           0 :         pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
    1084           0 :         int id, rc = 0;
    1085           0 :         long length;
    1086             : 
    1087           0 :         id = dax_read_lock();
    1088           0 :         length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
    1089             :                                    DAX_ACCESS, kaddr, pfnp);
    1090           0 :         if (length < 0) {
    1091           0 :                 rc = length;
    1092           0 :                 goto out;
    1093             :         }
    1094           0 :         if (!pfnp)
    1095           0 :                 goto out_check_addr;
    1096           0 :         rc = -EINVAL;
    1097           0 :         if (PFN_PHYS(length) < size)
    1098           0 :                 goto out;
    1099           0 :         if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
    1100           0 :                 goto out;
    1101             :         /* For larger pages we need devmap */
    1102           0 :         if (length > 1 && !pfn_t_devmap(*pfnp))
    1103           0 :                 goto out;
    1104             :         rc = 0;
    1105             : 
    1106           0 : out_check_addr:
    1107           0 :         if (!kaddr)
    1108           0 :                 goto out;
    1109           0 :         if (!*kaddr)
    1110           0 :                 rc = -EFAULT;
    1111           0 : out:
    1112           0 :         dax_read_unlock(id);
    1113           0 :         return rc;
    1114             : }
    1115             : 
    1116             : /**
    1117             :  * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
    1118             :  * by copying the data before and after the range to be written.
     1119             :  * @pos:        address to copy from.
     1120             :  * @length:     size of the copy operation.
     1121             :  * @align_size: size to align the copy to (either PMD_SIZE or PAGE_SIZE)
    1122             :  * @srcmap:     iomap srcmap
    1123             :  * @daddr:      destination address to copy to.
    1124             :  *
     1125             :  * This can be called from two places: either during a DAX write fault (page
     1126             :  * aligned), to copy @length bytes of data to @daddr, or during a normal DAX
     1127             :  * write operation, where dax_iomap_iter() may call it to copy the unaligned
     1128             :  * start or end of the range. In the latter case the copy of the aligned
     1129             :  * ranges is taken care of by dax_iomap_iter() itself.
     1130             :  * If the srcmap contains invalid data, such as HOLE or UNWRITTEN, zero the
     1131             :  * area to make sure no old data remains.
    1132             :  */
    1133           0 : static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
    1134             :                 const struct iomap *srcmap, void *daddr)
    1135             : {
    1136           0 :         loff_t head_off = pos & (align_size - 1);
    1137           0 :         size_t size = ALIGN(head_off + length, align_size);
    1138           0 :         loff_t end = pos + length;
    1139           0 :         loff_t pg_end = round_up(end, align_size);
     1140             :         /* copy_all is usually true in the page fault case */
    1141           0 :         bool copy_all = head_off == 0 && end == pg_end;
    1142             :         /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
    1143           0 :         bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
    1144           0 :                          srcmap->type == IOMAP_UNWRITTEN;
    1145           0 :         void *saddr = 0;
    1146           0 :         int ret = 0;
    1147             : 
    1148           0 :         if (!zero_edge) {
    1149           0 :                 ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
    1150           0 :                 if (ret)
    1151           0 :                         return dax_mem2blk_err(ret);
    1152             :         }
    1153             : 
    1154           0 :         if (copy_all) {
    1155           0 :                 if (zero_edge)
    1156           0 :                         memset(daddr, 0, size);
    1157             :                 else
    1158           0 :                         ret = copy_mc_to_kernel(daddr, saddr, length);
    1159           0 :                 goto out;
    1160             :         }
    1161             : 
    1162             :         /* Copy the head part of the range */
    1163           0 :         if (head_off) {
    1164           0 :                 if (zero_edge)
    1165           0 :                         memset(daddr, 0, head_off);
    1166             :                 else {
    1167           0 :                         ret = copy_mc_to_kernel(daddr, saddr, head_off);
    1168           0 :                         if (ret)
    1169             :                                 return -EIO;
    1170             :                 }
    1171             :         }
    1172             : 
    1173             :         /* Copy the tail part of the range */
    1174           0 :         if (end < pg_end) {
    1175           0 :                 loff_t tail_off = head_off + length;
    1176           0 :                 loff_t tail_len = pg_end - end;
    1177             : 
    1178           0 :                 if (zero_edge)
    1179           0 :                         memset(daddr + tail_off, 0, tail_len);
    1180             :                 else {
    1181           0 :                         ret = copy_mc_to_kernel(daddr + tail_off,
    1182           0 :                                                 saddr + tail_off, tail_len);
    1183           0 :                         if (ret)
    1184             :                                 return -EIO;
    1185             :                 }
    1186             :         }
    1187           0 : out:
    1188           0 :         if (zero_edge)
    1189           0 :                 dax_flush(srcmap->dax_dev, daddr, size);
    1190           0 :         return ret ? -EIO : 0;
    1191             : }
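/*
 * Editor's note: a worked example of the edge handling above, assuming
 * align_size == PAGE_SIZE == 4096; the numbers are illustrative only.
 *
 *	pos = 4608, length = 512
 *	head_off = 4608 & 4095            = 512
 *	size     = ALIGN(512 + 512, 4096) = 4096
 *	end      = 4608 + 512             = 5120
 *	pg_end   = round_up(5120, 4096)   = 8192
 *
 * copy_all is false, so the head [0, 512) and the tail [1024, 4096) of the
 * destination page are either copied from @saddr or zeroed, while the middle
 * [512, 1024) is left for the caller to fill with the new data.
 */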
    1192             : 
    1193             : /*
    1194             :  * The user has performed a load from a hole in the file.  Allocating a new
    1195             :  * page in the file would cause excessive storage usage for workloads with
    1196             :  * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
    1197             :  * If this page is ever written to we will re-fault and change the mapping to
    1198             :  * point to real DAX storage instead.
    1199             :  */
    1200           0 : static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
    1201             :                 const struct iomap_iter *iter, void **entry)
    1202             : {
    1203           0 :         struct inode *inode = iter->inode;
    1204           0 :         unsigned long vaddr = vmf->address;
    1205           0 :         pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
    1206           0 :         vm_fault_t ret;
    1207             : 
    1208           0 :         *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
    1209             : 
    1210           0 :         ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
    1211           0 :         trace_dax_load_hole(inode, vmf, ret);
    1212           0 :         return ret;
    1213             : }
    1214             : 
    1215             : #ifdef CONFIG_FS_DAX_PMD
    1216           0 : static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
    1217             :                 const struct iomap_iter *iter, void **entry)
    1218             : {
    1219           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    1220           0 :         unsigned long pmd_addr = vmf->address & PMD_MASK;
    1221           0 :         struct vm_area_struct *vma = vmf->vma;
    1222           0 :         struct inode *inode = mapping->host;
    1223           0 :         pgtable_t pgtable = NULL;
    1224           0 :         struct page *zero_page;
    1225           0 :         spinlock_t *ptl;
    1226           0 :         pmd_t pmd_entry;
    1227           0 :         pfn_t pfn;
    1228             : 
    1229           0 :         zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
    1230             : 
    1231           0 :         if (unlikely(!zero_page))
    1232           0 :                 goto fallback;
    1233             : 
    1234           0 :         pfn = page_to_pfn_t(zero_page);
    1235           0 :         *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
    1236             :                                   DAX_PMD | DAX_ZERO_PAGE);
    1237             : 
    1238           0 :         if (arch_needs_pgtable_deposit()) {
    1239             :                 pgtable = pte_alloc_one(vma->vm_mm);
    1240             :                 if (!pgtable)
    1241             :                         return VM_FAULT_OOM;
    1242             :         }
    1243             : 
    1244           0 :         ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
    1245           0 :         if (!pmd_none(*(vmf->pmd))) {
    1246           0 :                 spin_unlock(ptl);
    1247           0 :                 goto fallback;
    1248             :         }
    1249             : 
    1250           0 :         if (pgtable) {
    1251             :                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
    1252             :                 mm_inc_nr_ptes(vma->vm_mm);
    1253             :         }
    1254           0 :         pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
    1255           0 :         pmd_entry = pmd_mkhuge(pmd_entry);
    1256           0 :         set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
    1257           0 :         spin_unlock(ptl);
    1258           0 :         trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
    1259           0 :         return VM_FAULT_NOPAGE;
    1260             : 
    1261           0 : fallback:
    1262           0 :         if (pgtable)
    1263             :                 pte_free(vma->vm_mm, pgtable);
    1264           0 :         trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
    1265           0 :         return VM_FAULT_FALLBACK;
    1266             : }
    1267             : #else
    1268             : static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
    1269             :                 const struct iomap_iter *iter, void **entry)
    1270             : {
    1271             :         return VM_FAULT_FALLBACK;
    1272             : }
    1273             : #endif /* CONFIG_FS_DAX_PMD */
    1274             : 
    1275           0 : static s64 dax_unshare_iter(struct iomap_iter *iter)
    1276             : {
    1277           0 :         struct iomap *iomap = &iter->iomap;
    1278           0 :         const struct iomap *srcmap = iomap_iter_srcmap(iter);
    1279           0 :         loff_t pos = iter->pos;
    1280           0 :         loff_t length = iomap_length(iter);
    1281           0 :         int id = 0;
    1282           0 :         s64 ret = 0;
    1283           0 :         void *daddr = NULL, *saddr = NULL;
    1284             : 
    1285             :         /* don't bother with blocks that are not shared to start with */
    1286           0 :         if (!(iomap->flags & IOMAP_F_SHARED))
    1287             :                 return length;
    1288             : 
    1289           0 :         id = dax_read_lock();
    1290           0 :         ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
    1291           0 :         if (ret < 0)
    1292           0 :                 goto out_unlock;
    1293             : 
     1294             :         /* zero the destination if srcmap is HOLE or UNWRITTEN */
    1295           0 :         if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
    1296           0 :                 memset(daddr, 0, length);
    1297           0 :                 dax_flush(iomap->dax_dev, daddr, length);
    1298           0 :                 ret = length;
    1299           0 :                 goto out_unlock;
    1300             :         }
    1301             : 
    1302           0 :         ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
    1303           0 :         if (ret < 0)
    1304           0 :                 goto out_unlock;
    1305             : 
    1306           0 :         if (copy_mc_to_kernel(daddr, saddr, length) == 0)
    1307             :                 ret = length;
    1308             :         else
    1309           0 :                 ret = -EIO;
    1310             : 
    1311           0 : out_unlock:
    1312           0 :         dax_read_unlock(id);
    1313           0 :         return dax_mem2blk_err(ret);
    1314             : }
    1315             : 
    1316           0 : int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
    1317             :                 const struct iomap_ops *ops)
    1318             : {
    1319           0 :         struct iomap_iter iter = {
    1320             :                 .inode          = inode,
    1321             :                 .pos            = pos,
    1322             :                 .len            = len,
    1323             :                 .flags          = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
    1324             :         };
    1325           0 :         int ret;
    1326             : 
    1327           0 :         while ((ret = iomap_iter(&iter, ops)) > 0)
    1328           0 :                 iter.processed = dax_unshare_iter(&iter);
    1329           0 :         return ret;
    1330             : }
    1331             : EXPORT_SYMBOL_GPL(dax_file_unshare);
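/*
 * Editor's note: a minimal sketch of a FALLOC_FL_UNSHARE path using the
 * helper above for DAX inodes. The "myfs_*" iomap_ops names are assumptions;
 * a reflink-capable filesystem such as xfs wires this up in a similar way.
 */
#if 0	/* illustration only */
static int myfs_unshare_range(struct inode *inode, loff_t pos, loff_t len)
{
	if (IS_DAX(inode))
		return dax_file_unshare(inode, pos, len,
					&myfs_dax_write_iomap_ops);
	return iomap_file_unshare(inode, pos, len,
				  &myfs_buffered_write_iomap_ops);
}
#endif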
    1332             : 
    1333           0 : static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
    1334             : {
    1335           0 :         const struct iomap *iomap = &iter->iomap;
    1336           0 :         const struct iomap *srcmap = iomap_iter_srcmap(iter);
    1337           0 :         unsigned offset = offset_in_page(pos);
    1338           0 :         pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
    1339           0 :         void *kaddr;
    1340           0 :         long ret;
    1341             : 
    1342           0 :         ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
    1343             :                                 NULL);
    1344           0 :         if (ret < 0)
    1345           0 :                 return dax_mem2blk_err(ret);
    1346             : 
    1347           0 :         memset(kaddr + offset, 0, size);
    1348           0 :         if (iomap->flags & IOMAP_F_SHARED)
    1349           0 :                 ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
    1350             :                                             kaddr);
    1351             :         else
    1352           0 :                 dax_flush(iomap->dax_dev, kaddr + offset, size);
    1353           0 :         return ret;
    1354             : }
    1355             : 
    1356           0 : static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
    1357             : {
    1358           0 :         const struct iomap *iomap = &iter->iomap;
    1359           0 :         const struct iomap *srcmap = iomap_iter_srcmap(iter);
    1360           0 :         loff_t pos = iter->pos;
    1361           0 :         u64 length = iomap_length(iter);
    1362           0 :         s64 written = 0;
    1363             : 
    1364             :         /* already zeroed?  we're done. */
    1365           0 :         if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
    1366           0 :                 return length;
    1367             : 
    1368             :         /*
    1369             :          * invalidate the pages whose sharing state is to be changed
    1370             :          * because of CoW.
    1371             :          */
    1372           0 :         if (iomap->flags & IOMAP_F_SHARED)
    1373           0 :                 invalidate_inode_pages2_range(iter->inode->i_mapping,
    1374           0 :                                               pos >> PAGE_SHIFT,
    1375           0 :                                               (pos + length - 1) >> PAGE_SHIFT);
    1376             : 
    1377           0 :         do {
    1378           0 :                 unsigned offset = offset_in_page(pos);
    1379           0 :                 unsigned size = min_t(u64, PAGE_SIZE - offset, length);
    1380           0 :                 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
    1381           0 :                 long rc;
    1382           0 :                 int id;
    1383             : 
    1384           0 :                 id = dax_read_lock();
    1385           0 :                 if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
    1386           0 :                         rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
    1387             :                 else
    1388           0 :                         rc = dax_memzero(iter, pos, size);
    1389           0 :                 dax_read_unlock(id);
    1390             : 
    1391           0 :                 if (rc < 0)
    1392           0 :                         return rc;
    1393           0 :                 pos += size;
    1394           0 :                 length -= size;
    1395           0 :                 written += size;
    1396           0 :         } while (length > 0);
    1397             : 
    1398           0 :         if (did_zero)
    1399           0 :                 *did_zero = true;
    1400             :         return written;
    1401             : }
    1402             : 
    1403           0 : int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
    1404             :                 const struct iomap_ops *ops)
    1405             : {
    1406           0 :         struct iomap_iter iter = {
    1407             :                 .inode          = inode,
    1408             :                 .pos            = pos,
    1409             :                 .len            = len,
    1410             :                 .flags          = IOMAP_DAX | IOMAP_ZERO,
    1411             :         };
    1412           0 :         int ret;
    1413             : 
    1414           0 :         while ((ret = iomap_iter(&iter, ops)) > 0)
    1415           0 :                 iter.processed = dax_zero_iter(&iter, did_zero);
    1416           0 :         return ret;
    1417             : }
    1418             : EXPORT_SYMBOL_GPL(dax_zero_range);
    1419             : 
    1420           0 : int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
    1421             :                 const struct iomap_ops *ops)
    1422             : {
    1423           0 :         unsigned int blocksize = i_blocksize(inode);
    1424           0 :         unsigned int off = pos & (blocksize - 1);
    1425             : 
    1426             :         /* Block boundary? Nothing to do */
    1427           0 :         if (!off)
    1428             :                 return 0;
    1429           0 :         return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
    1430             : }
    1431             : EXPORT_SYMBOL_GPL(dax_truncate_page);
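/*
 * Editor's note: a minimal sketch of how a filesystem's setattr/truncate path
 * might use dax_truncate_page() above to zero the tail of the block that the
 * new EOF lands in (dax_zero_range() covers arbitrary sub-ranges in the same
 * way). "myfs_iomap_ops" is an assumed iomap_ops instance; compare
 * ext2_setsize() for a real caller.
 */
#if 0	/* illustration only */
static int myfs_dax_setsize(struct inode *inode, loff_t newsize)
{
	int error;

	/* zero from the new EOF to the end of its block */
	error = dax_truncate_page(inode, newsize, NULL, &myfs_iomap_ops);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	return 0;
}
#endif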
    1432             : 
    1433           0 : static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
    1434             :                 struct iov_iter *iter)
    1435             : {
    1436           0 :         const struct iomap *iomap = &iomi->iomap;
    1437           0 :         const struct iomap *srcmap = iomap_iter_srcmap(iomi);
    1438           0 :         loff_t length = iomap_length(iomi);
    1439           0 :         loff_t pos = iomi->pos;
    1440           0 :         struct dax_device *dax_dev = iomap->dax_dev;
    1441           0 :         loff_t end = pos + length, done = 0;
    1442           0 :         bool write = iov_iter_rw(iter) == WRITE;
    1443           0 :         bool cow = write && iomap->flags & IOMAP_F_SHARED;
    1444           0 :         ssize_t ret = 0;
    1445           0 :         size_t xfer;
    1446           0 :         int id;
    1447             : 
    1448           0 :         if (!write) {
    1449           0 :                 end = min(end, i_size_read(iomi->inode));
    1450           0 :                 if (pos >= end)
    1451             :                         return 0;
    1452             : 
    1453           0 :                 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
    1454           0 :                         return iov_iter_zero(min(length, end - pos), iter);
    1455             :         }
    1456             : 
    1457             :         /*
    1458             :          * In DAX mode, enforce either pure overwrites of written extents, or
    1459             :          * writes to unwritten extents as part of a copy-on-write operation.
    1460             :          */
    1461           0 :         if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
    1462             :                         !(iomap->flags & IOMAP_F_SHARED)))
    1463             :                 return -EIO;
    1464             : 
    1465             :         /*
     1466             :          * A write can allocate a block for an area which has a hole page
     1467             :          * mapped into the page tables. We have to tear down these mappings
     1468             :          * so that data written by write(2) is visible via mmap.
    1469             :          */
    1470           0 :         if (iomap->flags & IOMAP_F_NEW || cow) {
    1471             :                 /*
     1472             :                  * The filesystem allows CoW on non-shared extents. The source
     1473             :                  * extents may have been mmapped and marked dirty before. To be
     1474             :                  * able to invalidate their dax entries, we need to clear the
     1475             :                  * dirty mark in advance.
    1476             :                  */
    1477           0 :                 if (cow)
    1478           0 :                         __dax_clear_dirty_range(iomi->inode->i_mapping,
    1479           0 :                                                 pos >> PAGE_SHIFT,
    1480           0 :                                                 (end - 1) >> PAGE_SHIFT);
    1481           0 :                 invalidate_inode_pages2_range(iomi->inode->i_mapping,
    1482           0 :                                               pos >> PAGE_SHIFT,
    1483           0 :                                               (end - 1) >> PAGE_SHIFT);
    1484             :         }
    1485             : 
    1486           0 :         id = dax_read_lock();
    1487           0 :         while (pos < end) {
    1488           0 :                 unsigned offset = pos & (PAGE_SIZE - 1);
    1489           0 :                 const size_t size = ALIGN(length + offset, PAGE_SIZE);
    1490           0 :                 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
    1491           0 :                 ssize_t map_len;
    1492           0 :                 bool recovery = false;
    1493           0 :                 void *kaddr;
    1494             : 
    1495           0 :                 if (fatal_signal_pending(current)) {
    1496             :                         ret = -EINTR;
    1497           0 :                         break;
    1498             :                 }
    1499             : 
    1500           0 :                 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
    1501             :                                 DAX_ACCESS, &kaddr, NULL);
    1502           0 :                 if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
    1503           0 :                         map_len = dax_direct_access(dax_dev, pgoff,
    1504             :                                         PHYS_PFN(size), DAX_RECOVERY_WRITE,
    1505             :                                         &kaddr, NULL);
    1506           0 :                         if (map_len > 0)
    1507           0 :                                 recovery = true;
    1508             :                 }
    1509           0 :                 if (map_len < 0) {
    1510           0 :                         ret = dax_mem2blk_err(map_len);
    1511           0 :                         break;
    1512             :                 }
    1513             : 
    1514           0 :                 if (cow) {
    1515           0 :                         ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
    1516             :                                                     srcmap, kaddr);
    1517           0 :                         if (ret)
    1518             :                                 break;
    1519             :                 }
    1520             : 
    1521           0 :                 map_len = PFN_PHYS(map_len);
    1522           0 :                 kaddr += offset;
    1523           0 :                 map_len -= offset;
    1524           0 :                 if (map_len > end - pos)
    1525             :                         map_len = end - pos;
    1526             : 
    1527           0 :                 if (recovery)
    1528           0 :                         xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
    1529             :                                         map_len, iter);
    1530           0 :                 else if (write)
    1531           0 :                         xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
    1532             :                                         map_len, iter);
    1533             :                 else
    1534           0 :                         xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
    1535             :                                         map_len, iter);
    1536             : 
    1537           0 :                 pos += xfer;
    1538           0 :                 length -= xfer;
    1539           0 :                 done += xfer;
    1540             : 
    1541           0 :                 if (xfer == 0)
    1542           0 :                         ret = -EFAULT;
    1543           0 :                 if (xfer < map_len)
    1544             :                         break;
    1545             :         }
    1546           0 :         dax_read_unlock(id);
    1547             : 
    1548           0 :         return done ? done : ret;
    1549             : }
    1550             : 
    1551             : /**
    1552             :  * dax_iomap_rw - Perform I/O to a DAX file
    1553             :  * @iocb:       The control block for this I/O
    1554             :  * @iter:       The addresses to do I/O from or to
    1555             :  * @ops:        iomap ops passed from the file system
    1556             :  *
    1557             :  * This function performs read and write operations to directly mapped
     1558             :  * persistent memory.  The caller needs to take care of read/write exclusion
    1559             :  * and evicting any page cache pages in the region under I/O.
    1560             :  */
    1561             : ssize_t
    1562           0 : dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
    1563             :                 const struct iomap_ops *ops)
    1564             : {
    1565           0 :         struct iomap_iter iomi = {
    1566           0 :                 .inode          = iocb->ki_filp->f_mapping->host,
    1567           0 :                 .pos            = iocb->ki_pos,
    1568             :                 .len            = iov_iter_count(iter),
    1569             :                 .flags          = IOMAP_DAX,
    1570             :         };
    1571           0 :         loff_t done = 0;
    1572           0 :         int ret;
    1573             : 
    1574           0 :         if (!iomi.len)
    1575             :                 return 0;
    1576             : 
    1577           0 :         if (iov_iter_rw(iter) == WRITE) {
    1578           0 :                 lockdep_assert_held_write(&iomi.inode->i_rwsem);
    1579           0 :                 iomi.flags |= IOMAP_WRITE;
    1580             :         } else {
    1581           0 :                 lockdep_assert_held(&iomi.inode->i_rwsem);
    1582             :         }
    1583             : 
    1584           0 :         if (iocb->ki_flags & IOCB_NOWAIT)
    1585           0 :                 iomi.flags |= IOMAP_NOWAIT;
    1586             : 
    1587           0 :         while ((ret = iomap_iter(&iomi, ops)) > 0)
    1588           0 :                 iomi.processed = dax_iomap_iter(&iomi, iter);
    1589             : 
    1590           0 :         done = iomi.pos - iocb->ki_pos;
    1591           0 :         iocb->ki_pos = iomi.pos;
    1592           0 :         return done ? done : ret;
    1593             : }
    1594             : EXPORT_SYMBOL_GPL(dax_iomap_rw);
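/*
 * Editor's note: a minimal sketch of ->read_iter/->write_iter methods built
 * on the helper above, taking i_rwsem as the lockdep assertions in
 * dax_iomap_rw() expect. "myfs_iomap_ops" is an assumed iomap_ops instance;
 * compare the ext2/ext4/xfs DAX file operations for real callers.
 */
#if 0	/* illustration only */
static ssize_t myfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
	inode_unlock_shared(inode);
	return ret;
}

static ssize_t myfs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = dax_iomap_rw(iocb, from, &myfs_iomap_ops);
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif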
    1595             : 
    1596             : static vm_fault_t dax_fault_return(int error)
    1597             : {
    1598           0 :         if (error == 0)
    1599             :                 return VM_FAULT_NOPAGE;
    1600           0 :         return vmf_error(error);
    1601             : }
    1602             : 
    1603             : /*
     1604             :  * When handling a synchronous page fault and the inode needs fsync, we can
     1605             :  * insert the PTE/PMD into the page tables only after that fsync has happened.
     1606             :  * Skip the insertion for now and return the pfn so that the caller can insert
     1607             :  * it after the fsync is done.
    1608             :  */
    1609             : static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
    1610             : {
    1611           0 :         if (WARN_ON_ONCE(!pfnp))
    1612             :                 return VM_FAULT_SIGBUS;
    1613           0 :         *pfnp = pfn;
    1614           0 :         return VM_FAULT_NEEDDSYNC;
    1615             : }
    1616             : 
    1617           0 : static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
    1618             :                 const struct iomap_iter *iter)
    1619             : {
    1620           0 :         vm_fault_t ret;
    1621           0 :         int error = 0;
    1622             : 
    1623           0 :         switch (iter->iomap.type) {
    1624           0 :         case IOMAP_HOLE:
    1625             :         case IOMAP_UNWRITTEN:
    1626           0 :                 clear_user_highpage(vmf->cow_page, vmf->address);
    1627             :                 break;
    1628           0 :         case IOMAP_MAPPED:
    1629           0 :                 error = copy_cow_page_dax(vmf, iter);
    1630           0 :                 break;
    1631             :         default:
    1632           0 :                 WARN_ON_ONCE(1);
    1633           0 :                 error = -EIO;
    1634           0 :                 break;
    1635             :         }
    1636             : 
    1637           0 :         if (error)
    1638           0 :                 return dax_fault_return(error);
    1639             : 
    1640           0 :         __SetPageUptodate(vmf->cow_page);
    1641           0 :         ret = finish_fault(vmf);
    1642           0 :         if (!ret)
    1643           0 :                 return VM_FAULT_DONE_COW;
    1644             :         return ret;
    1645             : }
    1646             : 
    1647             : /**
    1648             :  * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
    1649             :  * @vmf:        vm fault instance
    1650             :  * @iter:       iomap iter
    1651             :  * @pfnp:       pfn to be returned
    1652             :  * @xas:        the dax mapping tree of a file
    1653             :  * @entry:      an unlocked dax entry to be inserted
    1654             :  * @pmd:        distinguish whether it is a pmd fault
    1655             :  */
    1656           0 : static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
    1657             :                 const struct iomap_iter *iter, pfn_t *pfnp,
    1658             :                 struct xa_state *xas, void **entry, bool pmd)
    1659             : {
    1660           0 :         const struct iomap *iomap = &iter->iomap;
    1661           0 :         const struct iomap *srcmap = iomap_iter_srcmap(iter);
    1662           0 :         size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
    1663           0 :         loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
    1664           0 :         bool write = iter->flags & IOMAP_WRITE;
    1665           0 :         unsigned long entry_flags = pmd ? DAX_PMD : 0;
    1666           0 :         int err = 0;
    1667           0 :         pfn_t pfn;
    1668           0 :         void *kaddr;
    1669             : 
    1670           0 :         if (!pmd && vmf->cow_page)
    1671           0 :                 return dax_fault_cow_page(vmf, iter);
    1672             : 
     1673             :         /* if we are reading UNWRITTEN or HOLE, return a hole. */
    1674           0 :         if (!write &&
    1675           0 :             (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
    1676           0 :                 if (!pmd)
    1677           0 :                         return dax_load_hole(xas, vmf, iter, entry);
    1678           0 :                 return dax_pmd_load_hole(xas, vmf, iter, entry);
    1679             :         }
    1680             : 
    1681           0 :         if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
    1682           0 :                 WARN_ON_ONCE(1);
    1683           0 :                 return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
    1684             :         }
    1685             : 
    1686           0 :         err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
    1687           0 :         if (err)
    1688           0 :                 return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
    1689             : 
    1690           0 :         *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
    1691             : 
    1692           0 :         if (write && iomap->flags & IOMAP_F_SHARED) {
    1693           0 :                 err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
    1694           0 :                 if (err)
    1695           0 :                         return dax_fault_return(err);
    1696             :         }
    1697             : 
    1698           0 :         if (dax_fault_is_synchronous(iter, vmf->vma))
    1699           0 :                 return dax_fault_synchronous_pfnp(pfnp, pfn);
    1700             : 
    1701             :         /* insert PMD pfn */
    1702           0 :         if (pmd)
    1703           0 :                 return vmf_insert_pfn_pmd(vmf, pfn, write);
    1704             : 
    1705             :         /* insert PTE pfn */
    1706           0 :         if (write)
    1707           0 :                 return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
    1708           0 :         return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
    1709             : }
    1710             : 
    1711           0 : static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
    1712             :                                int *iomap_errp, const struct iomap_ops *ops)
    1713             : {
    1714           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    1715           0 :         XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
    1716           0 :         struct iomap_iter iter = {
    1717           0 :                 .inode          = mapping->host,
    1718           0 :                 .pos            = (loff_t)vmf->pgoff << PAGE_SHIFT,
    1719             :                 .len            = PAGE_SIZE,
    1720             :                 .flags          = IOMAP_DAX | IOMAP_FAULT,
    1721             :         };
    1722           0 :         vm_fault_t ret = 0;
    1723           0 :         void *entry;
    1724           0 :         int error;
    1725             : 
    1726           0 :         trace_dax_pte_fault(iter.inode, vmf, ret);
    1727             :         /*
     1728             :          * Check now whether the offset is beyond the end of the file. The
     1729             :          * caller is supposed to hold locks serializing us with truncate /
     1730             :          * punch hole, so this is a reliable test.
    1731             :          */
    1732           0 :         if (iter.pos >= i_size_read(iter.inode)) {
    1733           0 :                 ret = VM_FAULT_SIGBUS;
    1734           0 :                 goto out;
    1735             :         }
    1736             : 
    1737           0 :         if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
    1738           0 :                 iter.flags |= IOMAP_WRITE;
    1739             : 
    1740           0 :         entry = grab_mapping_entry(&xas, mapping, 0);
    1741           0 :         if (xa_is_internal(entry)) {
    1742           0 :                 ret = xa_to_internal(entry);
    1743           0 :                 goto out;
    1744             :         }
    1745             : 
    1746             :         /*
    1747             :          * It is possible, particularly with mixed reads & writes to private
    1748             :          * mappings, that we have raced with a PMD fault that overlaps with
    1749             :          * the PTE we need to set up.  If so just return and the fault will be
    1750             :          * retried.
    1751             :          */
    1752           0 :         if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
    1753           0 :                 ret = VM_FAULT_NOPAGE;
    1754           0 :                 goto unlock_entry;
    1755             :         }
    1756             : 
    1757           0 :         while ((error = iomap_iter(&iter, ops)) > 0) {
    1758           0 :                 if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
    1759           0 :                         iter.processed = -EIO;  /* fs corruption? */
    1760           0 :                         continue;
    1761             :                 }
    1762             : 
    1763           0 :                 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
    1764           0 :                 if (ret != VM_FAULT_SIGBUS &&
    1765           0 :                     (iter.iomap.flags & IOMAP_F_NEW)) {
    1766           0 :                         count_vm_event(PGMAJFAULT);
    1767           0 :                         count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
    1768           0 :                         ret |= VM_FAULT_MAJOR;
    1769             :                 }
    1770             : 
    1771           0 :                 if (!(ret & VM_FAULT_ERROR))
    1772           0 :                         iter.processed = PAGE_SIZE;
    1773             :         }
    1774             : 
    1775           0 :         if (iomap_errp)
    1776           0 :                 *iomap_errp = error;
    1777           0 :         if (!ret && error)
    1778           0 :                 ret = dax_fault_return(error);
    1779             : 
    1780           0 : unlock_entry:
    1781           0 :         dax_unlock_entry(&xas, entry);
    1782           0 : out:
    1783           0 :         trace_dax_pte_fault_done(iter.inode, vmf, ret);
    1784           0 :         return ret;
    1785             : }
    1786             : 
    1787             : #ifdef CONFIG_FS_DAX_PMD
    1788           0 : static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
    1789             :                 pgoff_t max_pgoff)
    1790             : {
    1791           0 :         unsigned long pmd_addr = vmf->address & PMD_MASK;
    1792           0 :         bool write = vmf->flags & FAULT_FLAG_WRITE;
    1793             : 
    1794             :         /*
    1795             :          * Make sure that the faulting address's PMD offset (color) matches
    1796             :          * the PMD offset from the start of the file.  This is necessary so
    1797             :          * that a PMD range in the page table overlaps exactly with a PMD
    1798             :          * range in the page cache.
    1799             :          */
    1800           0 :         if ((vmf->pgoff & PG_PMD_COLOUR) !=
    1801           0 :             ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
    1802             :                 return true;
    1803             : 
    1804             :         /* Fall back to PTEs if we're going to COW */
    1805           0 :         if (write && !(vmf->vma->vm_flags & VM_SHARED))
    1806             :                 return true;
    1807             : 
    1808             :         /* If the PMD would extend outside the VMA */
    1809           0 :         if (pmd_addr < vmf->vma->vm_start)
    1810             :                 return true;
    1811           0 :         if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
    1812             :                 return true;
    1813             : 
    1814             :         /* If the PMD would extend beyond the file size */
    1815           0 :         if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
    1816           0 :                 return true;
    1817             : 
    1818             :         return false;
    1819             : }
    1820             : 
    1821           0 : static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
    1822             :                                const struct iomap_ops *ops)
    1823             : {
    1824           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    1825           0 :         XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
    1826           0 :         struct iomap_iter iter = {
    1827           0 :                 .inode          = mapping->host,
    1828             :                 .len            = PMD_SIZE,
    1829             :                 .flags          = IOMAP_DAX | IOMAP_FAULT,
    1830             :         };
    1831           0 :         vm_fault_t ret = VM_FAULT_FALLBACK;
    1832           0 :         pgoff_t max_pgoff;
    1833           0 :         void *entry;
    1834             : 
    1835           0 :         if (vmf->flags & FAULT_FLAG_WRITE)
    1836           0 :                 iter.flags |= IOMAP_WRITE;
    1837             : 
    1838             :         /*
     1839             :          * Check now whether the offset is beyond the end of the file. The
     1840             :          * caller is supposed to hold locks serializing us with truncate /
     1841             :          * punch hole, so this is a reliable test.
    1842             :          */
    1843           0 :         max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
    1844             : 
    1845           0 :         trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
    1846             : 
    1847           0 :         if (xas.xa_index >= max_pgoff) {
    1848           0 :                 ret = VM_FAULT_SIGBUS;
    1849           0 :                 goto out;
    1850             :         }
    1851             : 
    1852           0 :         if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
    1853           0 :                 goto fallback;
    1854             : 
    1855             :         /*
    1856             :          * grab_mapping_entry() will make sure we get an empty PMD entry,
    1857             :          * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
    1858             :          * entry is already in the array, for instance), it will return
    1859             :          * VM_FAULT_FALLBACK.
    1860             :          */
    1861           0 :         entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
    1862           0 :         if (xa_is_internal(entry)) {
    1863           0 :                 ret = xa_to_internal(entry);
    1864           0 :                 goto fallback;
    1865             :         }
    1866             : 
    1867             :         /*
    1868             :          * It is possible, particularly with mixed reads & writes to private
    1869             :          * mappings, that we have raced with a PTE fault that overlaps with
    1870             :          * the PMD we need to set up.  If so just return and the fault will be
    1871             :          * retried.
    1872             :          */
    1873           0 :         if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
    1874             :                         !pmd_devmap(*vmf->pmd)) {
    1875           0 :                 ret = 0;
    1876           0 :                 goto unlock_entry;
    1877             :         }
    1878             : 
    1879           0 :         iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
    1880           0 :         while (iomap_iter(&iter, ops) > 0) {
    1881           0 :                 if (iomap_length(&iter) < PMD_SIZE)
    1882           0 :                         continue; /* actually breaks out of the loop */
    1883             : 
    1884           0 :                 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
    1885           0 :                 if (ret != VM_FAULT_FALLBACK)
    1886           0 :                         iter.processed = PMD_SIZE;
    1887             :         }
    1888             : 
    1889           0 : unlock_entry:
    1890           0 :         dax_unlock_entry(&xas, entry);
    1891           0 : fallback:
    1892           0 :         if (ret == VM_FAULT_FALLBACK) {
    1893           0 :                 split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
    1894           0 :                 count_vm_event(THP_FAULT_FALLBACK);
    1895             :         }
    1896           0 : out:
    1897           0 :         trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
    1898           0 :         return ret;
    1899             : }
    1900             : #else
    1901             : static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
    1902             :                                const struct iomap_ops *ops)
    1903             : {
    1904             :         return VM_FAULT_FALLBACK;
    1905             : }
    1906             : #endif /* CONFIG_FS_DAX_PMD */
    1907             : 
    1908             : /**
    1909             :  * dax_iomap_fault - handle a page fault on a DAX file
    1910             :  * @vmf: The description of the fault
    1911             :  * @pe_size: Size of the page to fault in
    1912             :  * @pfnp: PFN to insert for synchronous faults if fsync is required
    1913             :  * @iomap_errp: Storage for detailed error code in case of error
    1914             :  * @ops: Iomap ops passed from the file system
    1915             :  *
    1916             :  * When a page fault occurs, filesystems may call this helper in
    1917             :  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
     1918             :  * has done all the necessary locking for the page fault to proceed
    1919             :  * successfully.
    1920             :  */
    1921           0 : vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
    1922             :                     pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
    1923             : {
    1924           0 :         switch (pe_size) {
    1925           0 :         case PE_SIZE_PTE:
    1926           0 :                 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
    1927           0 :         case PE_SIZE_PMD:
    1928           0 :                 return dax_iomap_pmd_fault(vmf, pfnp, ops);
    1929             :         default:
    1930             :                 return VM_FAULT_FALLBACK;
    1931             :         }
    1932             : }
    1933             : EXPORT_SYMBOL_GPL(dax_iomap_fault);
    1934             : 
    1935             : /*
    1936             :  * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
    1937             :  * @vmf: The description of the fault
    1938             :  * @pfn: PFN to insert
    1939             :  * @order: Order of entry to insert.
    1940             :  *
    1941             :  * This function inserts a writeable PTE or PMD entry into the page tables
     1942             :  * for an mmapped DAX file.  It also marks the page cache entry as dirty.
    1943             :  */
    1944             : static vm_fault_t
    1945           0 : dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
    1946             : {
    1947           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    1948           0 :         XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
    1949           0 :         void *entry;
    1950           0 :         vm_fault_t ret;
    1951             : 
    1952           0 :         xas_lock_irq(&xas);
    1953           0 :         entry = get_unlocked_entry(&xas, order);
    1954             :         /* Did we race with someone splitting entry or so? */
    1955           0 :         if (!entry || dax_is_conflict(entry) ||
    1956           0 :             (order == 0 && !dax_is_pte_entry(entry))) {
    1957           0 :                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
    1958           0 :                 xas_unlock_irq(&xas);
    1959           0 :                 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
    1960             :                                                       VM_FAULT_NOPAGE);
    1961           0 :                 return VM_FAULT_NOPAGE;
    1962             :         }
    1963           0 :         xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
    1964           0 :         dax_lock_entry(&xas, entry);
    1965           0 :         xas_unlock_irq(&xas);
    1966           0 :         if (order == 0)
    1967           0 :                 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
    1968             : #ifdef CONFIG_FS_DAX_PMD
    1969           0 :         else if (order == PMD_ORDER)
    1970           0 :                 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
    1971             : #endif
    1972             :         else
    1973             :                 ret = VM_FAULT_FALLBACK;
    1974           0 :         dax_unlock_entry(&xas, entry);
    1975           0 :         trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
    1976           0 :         return ret;
    1977             : }
    1978             : 
    1979             : /**
    1980             :  * dax_finish_sync_fault - finish synchronous page fault
    1981             :  * @vmf: The description of the fault
    1982             :  * @pe_size: Size of entry to be inserted
    1983             :  * @pfn: PFN to insert
    1984             :  *
    1985             :  * This function ensures that the file range touched by the page fault is
    1986             :  * stored persistently on the media and handles inserting the appropriate
    1987             :  * page table entry.
    1988             :  */
    1989           0 : vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
    1990             :                 enum page_entry_size pe_size, pfn_t pfn)
    1991             : {
    1992           0 :         int err;
    1993           0 :         loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
    1994           0 :         unsigned int order = pe_order(pe_size);
    1995           0 :         size_t len = PAGE_SIZE << order;
    1996             : 
    1997           0 :         err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
    1998           0 :         if (err)
    1999             :                 return VM_FAULT_SIGBUS;
    2000           0 :         return dax_insert_pfn_mkwrite(vmf, pfn, order);
    2001             : }
    2002             : EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
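When the mapping was created with MAP_SYNC and the iomap is flagged
IOMAP_F_DIRTY, dax_iomap_fault() stores the PFN through @pfnp and returns
VM_FAULT_NEEDDSYNC instead of installing the page table entry; the filesystem
then makes its metadata durable and finishes the fault with
dax_finish_sync_fault().  A hedged sketch of that two-step pattern, reusing
the hypothetical myfs_* names from the earlier sketch:

        #include <linux/dax.h>
        #include <linux/mm.h>

        extern const struct iomap_ops myfs_iomap_ops;  /* assumed */

        static vm_fault_t myfs_dax_write_fault(struct vm_fault *vmf,
                                               enum page_entry_size pe_size)
        {
                vm_fault_t ret;
                pfn_t pfn;

                /* ... take filesystem locks, reserve blocks, etc. ... */
                ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &myfs_iomap_ops);

                /*
                 * For a synchronous (MAP_SYNC) fault, dax_iomap_fault() left
                 * the page table untouched; flush the metadata covering the
                 * faulted range and insert the now-persistent mapping.
                 */
                if (ret & VM_FAULT_NEEDDSYNC)
                        ret = dax_finish_sync_fault(vmf, pe_size, pfn);

                /* ... drop filesystem locks ... */
                return ret;
        }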
    2003             : 
    2004           0 : static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
    2005             :                 struct iomap_iter *it_dest, u64 len, bool *same)
    2006             : {
    2007           0 :         const struct iomap *smap = &it_src->iomap;
    2008           0 :         const struct iomap *dmap = &it_dest->iomap;
    2009           0 :         loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
    2010           0 :         void *saddr, *daddr;
    2011           0 :         int id, ret;
    2012             : 
    2013           0 :         len = min(len, min(smap->length, dmap->length));
    2014             : 
    2015           0 :         if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
    2016           0 :                 *same = true;
    2017           0 :                 return len;
    2018             :         }
    2019             : 
    2020           0 :         if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
    2021           0 :                 *same = false;
    2022           0 :                 return 0;
    2023             :         }
    2024             : 
    2025           0 :         id = dax_read_lock();
    2026           0 :         ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
    2027             :                                       &saddr, NULL);
    2028           0 :         if (ret < 0)
    2029           0 :                 goto out_unlock;
    2030             : 
    2031           0 :         ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
    2032             :                                       &daddr, NULL);
    2033           0 :         if (ret < 0)
    2034           0 :                 goto out_unlock;
    2035             : 
    2036           0 :         *same = !memcmp(saddr, daddr, len);
    2037           0 :         if (!*same)
    2038           0 :                 len = 0;
    2039           0 :         dax_read_unlock(id);
    2040           0 :         return len;
    2041             : 
    2042           0 : out_unlock:
    2043           0 :         dax_read_unlock(id);
    2044           0 :         return -EIO;
    2045             : }
    2046             : 
    2047           0 : int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
    2048             :                 struct inode *dst, loff_t dstoff, loff_t len, bool *same,
    2049             :                 const struct iomap_ops *ops)
    2050             : {
    2051           0 :         struct iomap_iter src_iter = {
    2052             :                 .inode          = src,
    2053             :                 .pos            = srcoff,
    2054             :                 .len            = len,
    2055             :                 .flags          = IOMAP_DAX,
    2056             :         };
    2057           0 :         struct iomap_iter dst_iter = {
    2058             :                 .inode          = dst,
    2059             :                 .pos            = dstoff,
    2060             :                 .len            = len,
    2061             :                 .flags          = IOMAP_DAX,
    2062             :         };
    2063           0 :         int ret, compared = 0;
    2064             : 
    2065           0 :         while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
    2066           0 :                (ret = iomap_iter(&dst_iter, ops)) > 0) {
    2067           0 :                 compared = dax_range_compare_iter(&src_iter, &dst_iter,
    2068           0 :                                 min(src_iter.len, dst_iter.len), same);
    2069           0 :                 if (compared < 0)
    2070           0 :                         return ret;
    2071           0 :                 src_iter.processed = dst_iter.processed = compared;
    2072             :         }
    2073             :         return ret;
    2074             : }
    2075             : 
    2076           0 : int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
    2077             :                               struct file *file_out, loff_t pos_out,
    2078             :                               loff_t *len, unsigned int remap_flags,
    2079             :                               const struct iomap_ops *ops)
    2080             : {
    2081           0 :         return __generic_remap_file_range_prep(file_in, pos_in, file_out,
    2082             :                                                pos_out, len, remap_flags, ops);
    2083             : }
    2084             : EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
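dax_remap_file_range_prep() is the DAX-aware front end to
__generic_remap_file_range_prep(): supplying iomap ops lets the
REMAP_FILE_DEDUP byte comparison go through dax_dedupe_file_range_compare()
above, which reads via dax_direct_access() rather than the page cache.  A
rough sketch of how a reflink-capable filesystem might choose between the DAX
and generic prep helpers; myfs_remap_prep and myfs_read_iomap_ops are
placeholder names:

        #include <linux/dax.h>
        #include <linux/fs.h>

        extern const struct iomap_ops myfs_read_iomap_ops;  /* assumed */

        static int myfs_remap_prep(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t *len, unsigned int remap_flags)
        {
                if (IS_DAX(file_inode(file_in)) && IS_DAX(file_inode(file_out)))
                        return dax_remap_file_range_prep(file_in, pos_in,
                                        file_out, pos_out, len, remap_flags,
                                        &myfs_read_iomap_ops);

                /* Non-DAX (or mixed) files fall back to the page-cache path. */
                return generic_remap_file_range_prep(file_in, pos_in, file_out,
                                                     pos_out, len, remap_flags);
        }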

Generated by: LCOV version 1.14