LCOV - code coverage report
Current view:  top level - mm - filemap.c (source / functions)
Test:          fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023
Date:          2023-07-31 20:08:12

                    Hit       Total     Coverage
Lines:              1351      1661      81.3 %
Functions:          89        106       84.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *      linux/mm/filemap.c
       4             :  *
       5             :  * Copyright (C) 1994-1999  Linus Torvalds
       6             :  */
       7             : 
       8             : /*
       9             :  * This file handles the generic file mmap semantics used by
      10             :  * most "normal" filesystems (but you don't /have/ to use this:
      11             :  * the NFS filesystem used to do this differently, for example)
      12             :  */
      13             : #include <linux/export.h>
      14             : #include <linux/compiler.h>
      15             : #include <linux/dax.h>
      16             : #include <linux/fs.h>
      17             : #include <linux/sched/signal.h>
      18             : #include <linux/uaccess.h>
      19             : #include <linux/capability.h>
      20             : #include <linux/kernel_stat.h>
      21             : #include <linux/gfp.h>
      22             : #include <linux/mm.h>
      23             : #include <linux/swap.h>
      24             : #include <linux/swapops.h>
      25             : #include <linux/syscalls.h>
      26             : #include <linux/mman.h>
      27             : #include <linux/pagemap.h>
      28             : #include <linux/file.h>
      29             : #include <linux/uio.h>
      30             : #include <linux/error-injection.h>
      31             : #include <linux/hash.h>
      32             : #include <linux/writeback.h>
      33             : #include <linux/backing-dev.h>
      34             : #include <linux/pagevec.h>
      35             : #include <linux/security.h>
      36             : #include <linux/cpuset.h>
      37             : #include <linux/hugetlb.h>
      38             : #include <linux/memcontrol.h>
      39             : #include <linux/shmem_fs.h>
      40             : #include <linux/rmap.h>
      41             : #include <linux/delayacct.h>
      42             : #include <linux/psi.h>
      43             : #include <linux/ramfs.h>
      44             : #include <linux/page_idle.h>
      45             : #include <linux/migrate.h>
      46             : #include <linux/pipe_fs_i.h>
      47             : #include <linux/splice.h>
      48             : #include <asm/pgalloc.h>
      49             : #include <asm/tlbflush.h>
      50             : #include "internal.h"
      51             : 
      52             : #define CREATE_TRACE_POINTS
      53             : #include <trace/events/filemap.h>
      54             : 
      55             : /*
      56             :  * FIXME: remove all knowledge of the buffer layer from the core VM
      57             :  */
      58             : #include <linux/buffer_head.h> /* for try_to_free_buffers */
      59             : 
      60             : #include <asm/mman.h>
      61             : 
      62             : #include "swap.h"
      63             : 
      64             : /*
      65             :  * Shared mappings implemented 30.11.1994. It's not fully working yet,
      66             :  * though.
      67             :  *
      68             :  * Shared mappings now work. 15.8.1995  Bruno.
      69             :  *
      70             :  * finished 'unifying' the page and buffer cache and SMP-threaded the
      71             :  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
      72             :  *
      73             :  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
      74             :  */
      75             : 
      76             : /*
      77             :  * Lock ordering:
      78             :  *
      79             :  *  ->i_mmap_rwsem           (truncate_pagecache)
      80             :  *    ->private_lock         (__free_pte->block_dirty_folio)
      81             :  *      ->swap_lock          (exclusive_swap_page, others)
      82             :  *        ->i_pages lock
      83             :  *
      84             :  *  ->i_rwsem
      85             :  *    ->invalidate_lock              (acquired by fs in truncate path)
      86             :  *      ->i_mmap_rwsem               (truncate->unmap_mapping_range)
      87             :  *
      88             :  *  ->mmap_lock
      89             :  *    ->i_mmap_rwsem
      90             :  *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
      91             :  *        ->i_pages lock     (arch-dependent flush_dcache_mmap_lock)
      92             :  *
      93             :  *  ->mmap_lock
      94             :  *    ->invalidate_lock              (filemap_fault)
      95             :  *      ->lock_page          (filemap_fault, access_process_vm)
      96             :  *
      97             :  *  ->i_rwsem                        (generic_perform_write)
      98             :  *    ->mmap_lock            (fault_in_readable->do_page_fault)
      99             :  *
     100             :  *  bdi->wb.list_lock
     101             :  *    sb_lock                   (fs/fs-writeback.c)
     102             :  *    ->i_pages lock         (__sync_single_inode)
     103             :  *
     104             :  *  ->i_mmap_rwsem
     105             :  *    ->anon_vma.lock                (vma_merge)
     106             :  *
     107             :  *  ->anon_vma.lock
     108             :  *    ->page_table_lock or pte_lock  (anon_vma_prepare and various)
     109             :  *
     110             :  *  ->page_table_lock or pte_lock
     111             :  *    ->swap_lock            (try_to_unmap_one)
     112             :  *    ->private_lock         (try_to_unmap_one)
     113             :  *    ->i_pages lock         (try_to_unmap_one)
     114             :  *    ->lruvec->lru_lock  (follow_page->mark_page_accessed)
     115             :  *    ->lruvec->lru_lock  (check_pte_range->isolate_lru_page)
     116             :  *    ->private_lock         (page_remove_rmap->set_page_dirty)
     117             :  *    ->i_pages lock         (page_remove_rmap->set_page_dirty)
     118             :  *    bdi.wb->list_lock              (page_remove_rmap->set_page_dirty)
     119             :  *    ->inode->i_lock             (page_remove_rmap->set_page_dirty)
     120             :  *    ->memcg->move_lock  (page_remove_rmap->folio_memcg_lock)
     121             :  *    bdi.wb->list_lock              (zap_pte_range->set_page_dirty)
     122             :  *    ->inode->i_lock             (zap_pte_range->set_page_dirty)
     123             :  *    ->private_lock         (zap_pte_range->block_dirty_folio)
     124             :  *
     125             :  * ->i_mmap_rwsem
     126             :  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
     127             :  */
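
/*
 * Illustrative sketch of the nesting described above, shaped like the
 * truncate path: ->i_rwsem, then ->invalidate_lock, then ->i_mmap_rwsem,
 * then the i_pages lock.  A sketch only, not actual filemap.c code: real
 * callers go through truncate_pagecache()/unmap_mapping_range() rather
 * than open-coding the ordering, and the function name is hypothetical.
 */
static void lock_order_sketch(struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;

	inode_lock(inode);			/* ->i_rwsem */
	filemap_invalidate_lock(mapping);	/* ->invalidate_lock */
	i_mmap_lock_write(mapping);		/* ->i_mmap_rwsem */
	xa_lock_irq(&mapping->i_pages);		/* ->i_pages lock */

	/* ... operate on the page cache ... */

	xa_unlock_irq(&mapping->i_pages);
	i_mmap_unlock_write(mapping);
	filemap_invalidate_unlock(mapping);
	inode_unlock(inode);
}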
     128             : 
     129   638630254 : static void page_cache_delete(struct address_space *mapping,
     130             :                                    struct folio *folio, void *shadow)
     131             : {
     132   638630254 :         XA_STATE(xas, &mapping->i_pages, folio->index);
     133   638630254 :         long nr = 1;
     134             : 
     135  1277260508 :         mapping_set_update(&xas, mapping);
     136             : 
     137             :         /* hugetlb pages are represented by a single entry in the xarray */
     138   638630254 :         if (!folio_test_hugetlb(folio)) {
     139   638386299 :                 xas_set_order(&xas, folio->index, folio_order(folio));
     140   638386299 :                 nr = folio_nr_pages(folio);
     141             :         }
     142             : 
     143   638421079 :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
     144             : 
     145   638421079 :         xas_store(&xas, shadow);
     146   638273116 :         xas_init_marks(&xas);
     147             : 
     148   638371010 :         folio->mapping = NULL;
     149             :         /* Leave page->index set: truncation lookup relies upon it */
     150   638371010 :         mapping->nrpages -= nr;
     151   638371010 : }
     152             : 
     153  3847746913 : static void filemap_unaccount_folio(struct address_space *mapping,
     154             :                 struct folio *folio)
     155             : {
     156  3847746913 :         long nr;
     157             : 
     158  3847746913 :         VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
     159  3847746913 :         if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
     160           0 :                 pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
     161             :                          current->comm, folio_pfn(folio));
     162           0 :                 dump_page(&folio->page, "still mapped when deleted");
     163           0 :                 dump_stack();
     164           0 :                 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     165             : 
     166           0 :                 if (mapping_exiting(mapping) && !folio_test_large(folio)) {
     167           0 :                         int mapcount = page_mapcount(&folio->page);
     168             : 
     169           0 :                         if (folio_ref_count(folio) >= mapcount + 2) {
     170             :                                 /*
     171             :                                  * All vmas have already been torn down, so it's
     172             :                                  * a good bet that actually the page is unmapped
     173             :                                  * and we'd rather not leak it: if we're wrong,
     174             :                                  * another bad page check should catch it later.
     175             :                                  */
     176           0 :                                 page_mapcount_reset(&folio->page);
     177           0 :                                 folio_ref_sub(folio, mapcount);
     178             :                         }
     179             :                 }
     180             :         }
     181             : 
     182             :         /* hugetlb folios do not participate in page cache accounting. */
     183  3847491463 :         if (folio_test_hugetlb(folio))
     184             :                 return;
     185             : 
     186  3847573303 :         nr = folio_nr_pages(folio);
     187             : 
     188  3847573303 :         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
     189  3847951135 :         if (folio_test_swapbacked(folio)) {
     190   159269610 :                 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
     191   159121812 :                 if (folio_test_pmd_mappable(folio))
     192           0 :                         __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
     193  3688681525 :         } else if (folio_test_pmd_mappable(folio)) {
     194      188582 :                 __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
     195      188581 :                 filemap_nr_thps_dec(mapping);
     196             :         }
     197             : 
     198             :         /*
     199             :          * At this point folio must be either written or cleaned by
     200             :          * truncate.  Dirty folio here signals a bug and loss of
     201             :          * unwritten data - on ordinary filesystems.
     202             :          *
     203             :          * But it's harmless on in-memory filesystems like tmpfs; and can
     204             :          * occur when a driver which did get_user_pages() sets page dirty
     205             :          * before putting it, while the inode is being finally evicted.
     206             :          *
     207             :          * Below fixes dirty accounting after removing the folio entirely
     208             :          * but leaves the dirty flag set: it has no effect for truncated
     209             :          * folio and anyway will be cleared before returning folio to
     210             :          * buddy allocator.
     211             :          */
     212  3847803336 :         if (WARN_ON_ONCE(folio_test_dirty(folio) &&
     213             :                          mapping_can_writeback(mapping)))
     214           0 :                 folio_account_cleaned(folio, inode_to_wb(mapping->host));
     215             : }
     216             : 
     217             : /*
     218             :  * Delete a page from the page cache and free it. Caller has to make
     219             :  * sure the page is locked and that nobody else uses it - or that usage
     220             :  * is safe.  The caller must hold the i_pages lock.
     221             :  */
     222   638878015 : void __filemap_remove_folio(struct folio *folio, void *shadow)
     223             : {
     224   638878015 :         struct address_space *mapping = folio->mapping;
     225             : 
     226   638878015 :         trace_mm_filemap_delete_from_page_cache(folio);
     227   638630655 :         filemap_unaccount_folio(mapping, folio);
     228   638563013 :         page_cache_delete(mapping, folio, shadow);
     229   638293171 : }
     230             : 
     231  3452790871 : void filemap_free_folio(struct address_space *mapping, struct folio *folio)
     232             : {
     233  3452790871 :         void (*free_folio)(struct folio *);
     234  3452790871 :         int refs = 1;
     235             : 
     236  3452790871 :         free_folio = mapping->a_ops->free_folio;
     237  3452790871 :         if (free_folio)
     238           0 :                 free_folio(folio);
     239             : 
     240  3452818710 :         if (folio_test_large(folio) && !folio_test_hugetlb(folio))
     241   224306874 :                 refs = folio_nr_pages(folio);
     242  3452818305 :         folio_put_refs(folio, refs);
     243  3453086642 : }
     244             : 
     245             : /**
     246             :  * filemap_remove_folio - Remove folio from page cache.
     247             :  * @folio: The folio.
     248             :  *
     249             :  * This must be called only on folios that are locked and have been
     250             :  * verified to be in the page cache.  It will never put the folio into
     251             :  * the free list because the caller has a reference on the page.
     252             :  */
     253   188516833 : void filemap_remove_folio(struct folio *folio)
     254             : {
     255   188516833 :         struct address_space *mapping = folio->mapping;
     256             : 
     257   188516833 :         BUG_ON(!folio_test_locked(folio));
     258   188516833 :         spin_lock(&mapping->host->i_lock);
     259   188534156 :         xa_lock_irq(&mapping->i_pages);
     260   188536800 :         __filemap_remove_folio(folio, NULL);
     261   187903074 :         xa_unlock_irq(&mapping->i_pages);
     262   188255884 :         if (mapping_shrinkable(mapping))
     263   107102722 :                 inode_add_lru(mapping->host);
     264   187873283 :         spin_unlock(&mapping->host->i_lock);
     265             : 
     266   188298409 :         filemap_free_folio(mapping, folio);
     267   188484733 : }
     268             : 
     269             : /*
     270             :  * page_cache_delete_batch - delete several folios from page cache
     271             :  * @mapping: the mapping to which folios belong
     272             :  * @fbatch: batch of folios to delete
     273             :  *
     274             :  * The function walks over mapping->i_pages and removes folios passed in
     275             :  * @fbatch from the mapping. The function expects @fbatch to be sorted
     276             :  * by page index and is optimised for it to be dense.
     277             :  * It tolerates holes in @fbatch (mapping entries at those indices are not
     278             :  * modified).
     279             :  *
     280             :  * The function expects the i_pages lock to be held.
     281             :  */
     282   308700781 : static void page_cache_delete_batch(struct address_space *mapping,
     283             :                              struct folio_batch *fbatch)
     284             : {
     285   308700781 :         XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
     286   308700781 :         long total_pages = 0;
     287   308700781 :         int i = 0;
     288   308700781 :         struct folio *folio;
     289             : 
     290   617401562 :         mapping_set_update(&xas, mapping);
     291  3523716547 :         xas_for_each(&xas, folio, ULONG_MAX) {
     292  3477705459 :                 if (i >= folio_batch_count(fbatch))
     293             :                         break;
     294             : 
     295             :                 /* A swap/dax/shadow entry got inserted? Skip it. */
     296  3214877041 :                 if (xa_is_value(folio))
     297          93 :                         continue;
     298             :                 /*
     299             :                  * A page got inserted in our range? Skip it. We have our
     300             :                  * pages locked so they are protected from being removed.
     301             :                  * If we see a page whose index is higher than ours, it
     302             :                  * means our page has been removed, which shouldn't be
     303             :                  * possible because we're holding the PageLock.
     304             :                  */
     305  3214876948 :                 if (folio != fbatch->folios[i]) {
     306     5943381 :                         VM_BUG_ON_FOLIO(folio->index >
     307             :                                         fbatch->folios[i]->index, folio);
     308     5943381 :                         continue;
     309             :                 }
     310             : 
     311  3208933567 :                 WARN_ON_ONCE(!folio_test_locked(folio));
     312             : 
     313  3208933567 :                 folio->mapping = NULL;
     314             :                 /* Leave folio->index set: truncation lookup relies on it */
     315             : 
     316  3208933567 :                 i++;
     317  3208933567 :                 xas_store(&xas, NULL);
     318  3317856181 :                 total_pages += folio_nr_pages(folio);
     319             :         }
     320   308665809 :         mapping->nrpages -= total_pages;
     321   308665809 : }
     322             : 
     323   321090481 : void delete_from_page_cache_batch(struct address_space *mapping,
     324             :                                   struct folio_batch *fbatch)
     325             : {
     326   321090481 :         int i;
     327             : 
     328   321090481 :         if (!folio_batch_count(fbatch))
     329             :                 return;
     330             : 
     331   308724918 :         spin_lock(&mapping->host->i_lock);
     332   308722428 :         xa_lock_irq(&mapping->i_pages);
     333  3826715321 :         for (i = 0; i < folio_batch_count(fbatch); i++) {
     334  3209299280 :                 struct folio *folio = fbatch->folios[i];
     335             : 
     336  3209299280 :                 trace_mm_filemap_delete_from_page_cache(folio);
     337  3209215227 :                 filemap_unaccount_folio(mapping, folio);
     338             :         }
     339   308693613 :         page_cache_delete_batch(mapping, fbatch);
     340   308666248 :         xa_unlock_irq(&mapping->i_pages);
     341   308706468 :         if (mapping_shrinkable(mapping))
     342    27244160 :                 inode_add_lru(mapping->host);
     343   308656144 :         spin_unlock(&mapping->host->i_lock);
     344             : 
     345  3826491216 :         for (i = 0; i < folio_batch_count(fbatch); i++)
     346  3209114962 :                 filemap_free_folio(mapping, fbatch->folios[i]);
     347             : }
     348             : 
     349   907075025 : int filemap_check_errors(struct address_space *mapping)
     350             : {
     351   907075025 :         int ret = 0;
     352             :         /* Check for outstanding write errors */
     353  1814150122 :         if (test_bit(AS_ENOSPC, &mapping->flags) &&
     354             :             test_and_clear_bit(AS_ENOSPC, &mapping->flags))
     355          73 :                 ret = -ENOSPC;
     356  1814153954 :         if (test_bit(AS_EIO, &mapping->flags) &&
     357             :             test_and_clear_bit(AS_EIO, &mapping->flags))
     358        3904 :                 ret = -EIO;
     359   907075025 :         return ret;
     360             : }
     361             : EXPORT_SYMBOL(filemap_check_errors);
     362             : 
     363   158766511 : static int filemap_check_and_keep_errors(struct address_space *mapping)
     364             : {
     365             :         /* Check for outstanding write errors */
     366   317533022 :         if (test_bit(AS_EIO, &mapping->flags))
     367             :                 return -EIO;
     368   317519864 :         if (test_bit(AS_ENOSPC, &mapping->flags))
     369         217 :                 return -ENOSPC;
     370             :         return 0;
     371             : }
     372             : 
     373             : /**
     374             :  * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
     375             :  * @mapping:    address space structure to write
     376             :  * @wbc:        the writeback_control controlling the writeout
     377             :  *
     378             :  * Call writepages on the mapping using the provided wbc to control the
     379             :  * writeout.
     380             :  *
     381             :  * Return: %0 on success, negative error code otherwise.
     382             :  */
     383   988656830 : int filemap_fdatawrite_wbc(struct address_space *mapping,
     384             :                            struct writeback_control *wbc)
     385             : {
     386   988656830 :         int ret;
     387             : 
     388   988656830 :         if (!mapping_can_writeback(mapping) ||
     389             :             !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
     390             :                 return 0;
     391             : 
     392    55263271 :         wbc_attach_fdatawrite_inode(wbc, mapping->host);
     393    55264852 :         ret = do_writepages(mapping, wbc);
     394    55263616 :         wbc_detach_inode(wbc);
     395    55263616 :         return ret;
     396             : }
     397             : EXPORT_SYMBOL(filemap_fdatawrite_wbc);
     398             : 
     399             : /**
     400             :  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
     401             :  * @mapping:    address space structure to write
     402             :  * @start:      offset in bytes where the range starts
     403             :  * @end:        offset in bytes where the range ends (inclusive)
     404             :  * @sync_mode:  enable synchronous operation
     405             :  *
     406             :  * Start writeback against all of a mapping's dirty pages that lie
     407             :  * within the byte offsets <start, end> inclusive.
     408             :  *
     409             :  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
     410             :  * opposed to a regular memory cleansing writeback.  The difference between
     411             :  * these two operations is that if a dirty page/buffer is encountered, it must
     412             :  * be waited upon, and not just skipped over.
     413             :  *
     414             :  * Return: %0 on success, negative error code otherwise.
     415             :  */
     416   986827752 : int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     417             :                                 loff_t end, int sync_mode)
     418             : {
     419   986827752 :         struct writeback_control wbc = {
     420             :                 .sync_mode = sync_mode,
     421             :                 .nr_to_write = LONG_MAX,
     422             :                 .range_start = start,
     423             :                 .range_end = end,
     424             :         };
     425             : 
     426   986827752 :         return filemap_fdatawrite_wbc(mapping, &wbc);
     427             : }
     428             : 
     429             : static inline int __filemap_fdatawrite(struct address_space *mapping,
     430             :         int sync_mode)
     431             : {
     432   156837363 :         return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
     433             : }
     434             : 
     435   153811762 : int filemap_fdatawrite(struct address_space *mapping)
     436             : {
     437   153811762 :         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
     438             : }
     439             : EXPORT_SYMBOL(filemap_fdatawrite);
     440             : 
     441    11985545 : int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     442             :                                 loff_t end)
     443             : {
     444    11985545 :         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
     445             : }
     446             : EXPORT_SYMBOL(filemap_fdatawrite_range);
     447             : 
     448             : /**
     449             :  * filemap_flush - mostly a non-blocking flush
     450             :  * @mapping:    target address_space
     451             :  *
     452             :  * This is a mostly non-blocking flush.  Not suitable for data-integrity
     453             :  * purposes - I/O may not be started against all dirty pages.
     454             :  *
     455             :  * Return: %0 on success, negative error code otherwise.
     456             :  */
     457     3025601 : int filemap_flush(struct address_space *mapping)
     458             : {
     459     3025601 :         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
     460             : }
     461             : EXPORT_SYMBOL(filemap_flush);
     462             : 
     463             : /**
     464             :  * filemap_range_has_page - check if a page exists in range.
     465             :  * @mapping:           address space within which to check
     466             :  * @start_byte:        offset in bytes where the range starts
     467             :  * @end_byte:          offset in bytes where the range ends (inclusive)
     468             :  *
     469             :  * Find at least one page in the range supplied, usually used to check if
     470             :  * direct writing in this range will trigger a writeback.
     471             :  *
     472             :  * Return: %true if at least one page exists in the specified range,
     473             :  * %false otherwise.
     474             :  */
     475     3360303 : bool filemap_range_has_page(struct address_space *mapping,
     476             :                            loff_t start_byte, loff_t end_byte)
     477             : {
     478     3360303 :         struct folio *folio;
     479     3360303 :         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
     480     3360303 :         pgoff_t max = end_byte >> PAGE_SHIFT;
     481             : 
     482     3360303 :         if (end_byte < start_byte)
     483             :                 return false;
     484             : 
     485     3359701 :         rcu_read_lock();
     486     3359669 :         for (;;) {
     487     3359669 :                 folio = xas_find(&xas, max);
     488     3360157 :                 if (xas_retry(&xas, folio))
     489           0 :                         continue;
     490             :                 /* Shadow entries don't count */
     491     3360157 :                 if (xa_is_value(folio))
     492           0 :                         continue;
     493             :                 /*
     494             :                  * We don't need to try to pin this page; we're about to
     495             :                  * release the RCU lock anyway.  It is enough to know that
     496             :                  * there was a page here recently.
     497             :                  */
     498     3360157 :                 break;
     499             :         }
     500     3360157 :         rcu_read_unlock();
     501             : 
     502     3359736 :         return folio != NULL;
     503             : }
     504             : EXPORT_SYMBOL(filemap_range_has_page);
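
/*
 * Sketch of the typical caller pattern: a non-blocking (IOCB_NOWAIT)
 * direct write bails out with -EAGAIN if the target byte range still has
 * cached pages, since flushing them could block.  Hypothetical helper,
 * shown only to illustrate the check; not actual filemap.c code.
 */
static ssize_t dio_nowait_check_sketch(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos;

	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    filemap_range_has_page(mapping, pos, pos + count - 1))
		return -EAGAIN;

	return 0;
}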
     505             : 
     506   988185739 : static void __filemap_fdatawait_range(struct address_space *mapping,
     507             :                                      loff_t start_byte, loff_t end_byte)
     508             : {
     509   988185739 :         pgoff_t index = start_byte >> PAGE_SHIFT;
     510   988185739 :         pgoff_t end = end_byte >> PAGE_SHIFT;
     511   988185739 :         struct folio_batch fbatch;
     512   988185739 :         unsigned nr_folios;
     513             : 
     514   988185739 :         folio_batch_init(&fbatch);
     515             : 
     516  1012186461 :         while (index <= end) {
     517   994479859 :                 unsigned i;
     518             : 
     519   994479859 :                 nr_folios = filemap_get_folios_tag(mapping, &index, end,
     520             :                                 PAGECACHE_TAG_WRITEBACK, &fbatch);
     521             : 
     522   994222010 :                 if (!nr_folios)
     523             :                         break;
     524             : 
     525   186522904 :                 for (i = 0; i < nr_folios; i++) {
     526   162407535 :                         struct folio *folio = fbatch.folios[i];
     527             : 
     528   162407535 :                         folio_wait_writeback(folio);
     529   162403284 :                         folio_clear_error(folio);
     530             :                 }
     531    24115369 :                 folio_batch_release(&fbatch);
     532    24115045 :                 cond_resched();
     533             :         }
     534   987812256 : }
     535             : 
     536             : /**
     537             :  * filemap_fdatawait_range - wait for writeback to complete
     538             :  * @mapping:            address space structure to wait for
     539             :  * @start_byte:         offset in bytes where the range starts
     540             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     541             :  *
     542             :  * Walk the list of under-writeback pages of the given address space
     543             :  * in the given range and wait for all of them.  Check error status of
     544             :  * the address space and return it.
     545             :  *
     546             :  * Since the error status of the address space is cleared by this function,
     547             :  * callers are responsible for checking the return value and handling and/or
     548             :  * reporting the error.
     549             :  *
     550             :  * Return: error status of the address space.
     551             :  */
     552    11383114 : int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
     553             :                             loff_t end_byte)
     554             : {
     555    11383114 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     556    11382658 :         return filemap_check_errors(mapping);
     557             : }
     558             : EXPORT_SYMBOL(filemap_fdatawait_range);
     559             : 
     560             : /**
     561             :  * filemap_fdatawait_range_keep_errors - wait for writeback to complete
     562             :  * @mapping:            address space structure to wait for
     563             :  * @start_byte:         offset in bytes where the range starts
     564             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     565             :  *
     566             :  * Walk the list of under-writeback pages of the given address space in the
     567             :  * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
     568             :  * this function does not clear error status of the address space.
     569             :  *
     570             :  * Use this function if callers don't handle errors themselves.  Expected
     571             :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     572             :  * fsfreeze(8)
     573             :  */
     574      159983 : int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
     575             :                 loff_t start_byte, loff_t end_byte)
     576             : {
     577      159983 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     578      159983 :         return filemap_check_and_keep_errors(mapping);
     579             : }
     580             : EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
     581             : 
     582             : /**
     583             :  * file_fdatawait_range - wait for writeback to complete
     584             :  * @file:               file pointing to address space structure to wait for
     585             :  * @start_byte:         offset in bytes where the range starts
     586             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     587             :  *
     588             :  * Walk the list of under-writeback pages of the address space that file
     589             :  * refers to, in the given range and wait for all of them.  Check error
     590             :  * status of the address space vs. the file->f_wb_err cursor and return it.
     591             :  *
     592             :  * Since the error status of the file is advanced by this function,
     593             :  * callers are responsible for checking the return value and handling and/or
     594             :  * reporting the error.
     595             :  *
     596             :  * Return: error status of the address space vs. the file->f_wb_err cursor.
     597             :  */
     598        1501 : int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
     599             : {
     600        1501 :         struct address_space *mapping = file->f_mapping;
     601             : 
     602        1501 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     603        1501 :         return file_check_and_advance_wb_err(file);
     604             : }
     605             : EXPORT_SYMBOL(file_fdatawait_range);
     606             : 
     607             : /**
     608             :  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
     609             :  * @mapping: address space structure to wait for
     610             :  *
     611             :  * Walk the list of under-writeback pages of the given address space
     612             :  * and wait for all of them.  Unlike filemap_fdatawait(), this function
     613             :  * does not clear error status of the address space.
     614             :  *
     615             :  * Use this function if callers don't handle errors themselves.  Expected
     616             :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     617             :  * fsfreeze(8)
     618             :  *
     619             :  * Return: error status of the address space.
     620             :  */
     621   158722345 : int filemap_fdatawait_keep_errors(struct address_space *mapping)
     622             : {
     623   158722345 :         __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
     624   158373080 :         return filemap_check_and_keep_errors(mapping);
     625             : }
     626             : EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
     627             : 
     628             : /* Returns true if writeback might be needed or already in progress. */
     629             : static bool mapping_needs_writeback(struct address_space *mapping)
     630             : {
     631   916735650 :         return mapping->nrpages;
     632             : }
     633             : 
     634           0 : bool filemap_range_has_writeback(struct address_space *mapping,
     635             :                                  loff_t start_byte, loff_t end_byte)
     636             : {
     637           0 :         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
     638           0 :         pgoff_t max = end_byte >> PAGE_SHIFT;
     639           0 :         struct folio *folio;
     640             : 
     641           0 :         if (end_byte < start_byte)
     642             :                 return false;
     643             : 
     644           0 :         rcu_read_lock();
     645           0 :         xas_for_each(&xas, folio, max) {
     646           0 :                 if (xas_retry(&xas, folio))
     647           0 :                         continue;
     648           0 :                 if (xa_is_value(folio))
     649           0 :                         continue;
     650           0 :                 if (folio_test_dirty(folio) || folio_test_locked(folio) ||
     651             :                                 folio_test_writeback(folio))
     652             :                         break;
     653             :         }
     654           0 :         rcu_read_unlock();
     655           0 :         return folio != NULL;
     656             : }
     657             : EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
     658             : 
     659             : /**
     660             :  * filemap_write_and_wait_range - write out & wait on a file range
     661             :  * @mapping:    the address_space for the pages
     662             :  * @lstart:     offset in bytes where the range starts
     663             :  * @lend:       offset in bytes where the range ends (inclusive)
     664             :  *
     665             :  * Write out and wait upon file offsets lstart->lend, inclusive.
     666             :  *
     667             :  * Note that @lend is inclusive (describes the last byte to be written) so
     668             :  * that this function can be used to write to the very end-of-file (end = -1).
     669             :  *
     670             :  * Return: error status of the address space.
     671             :  */
     672   897367467 : int filemap_write_and_wait_range(struct address_space *mapping,
     673             :                                  loff_t lstart, loff_t lend)
     674             : {
     675   897367467 :         int err = 0, err2;
     676             : 
     677   897367467 :         if (lend < lstart)
     678             :                 return 0;
     679             : 
     680   895800621 :         if (mapping_needs_writeback(mapping)) {
     681   812818573 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     682             :                                                  WB_SYNC_ALL);
     683             :                 /*
     684             :                  * Even if the above returned error, the pages may be
     685             :                  * written partially (e.g. -ENOSPC), so we wait for it.
     686             :                  * But the -EIO is special case, it may indicate the worst
     687             :                  * thing (e.g. bug) happened, so we avoid waiting for it.
     688             :                  */
     689   812883998 :                 if (err != -EIO)
     690   812915854 :                         __filemap_fdatawait_range(mapping, lstart, lend);
     691             :         }
     692   895732869 :         err2 = filemap_check_errors(mapping);
     693   895627172 :         if (!err)
     694   895598856 :                 err = err2;
     695             :         return err;
     696             : }
     697             : EXPORT_SYMBOL(filemap_write_and_wait_range);
     698             : 
     699     5378560 : void __filemap_set_wb_err(struct address_space *mapping, int err)
     700             : {
     701     5378560 :         errseq_t eseq = errseq_set(&mapping->wb_err, err);
     702             : 
     703     5378560 :         trace_filemap_set_wb_err(mapping, eseq);
     704     5378558 : }
     705             : EXPORT_SYMBOL(__filemap_set_wb_err);
     706             : 
     707             : /**
     708             :  * file_check_and_advance_wb_err - report wb error (if any) that was previously
     709             :  *                                 and advance wb_err to current one
     710             :  * @file: struct file on which the error is being reported
     711             :  *
     712             :  * When userland calls fsync (or something like nfsd does the equivalent), we
     713             :  * want to report any writeback errors that occurred since the last fsync (or
     714             :  * since the file was opened if there haven't been any).
     715             :  *
     716             :  * Grab the wb_err from the mapping. If it matches what we have in the file,
     717             :  * then just quickly return 0. The file is all caught up.
     718             :  *
     719             :  * If it doesn't match, then take the mapping value, set the "seen" flag in
     720             :  * it and try to swap it into place. If it works, or another task beat us
     721             :  * to it with the new value, then update the f_wb_err and return the error
     722             :  * portion. The error at this point must be reported via proper channels
     723             :  * (a'la fsync, or NFS COMMIT operation, etc.).
     724             :  *
     725             :  * While we handle mapping->wb_err with atomic operations, the f_wb_err
     726             :  * value is protected by the f_lock since we must ensure that it reflects
     727             :  * the latest value swapped in for this file descriptor.
     728             :  *
     729             :  * Return: %0 on success, negative error code otherwise.
     730             :  */
     731    21450696 : int file_check_and_advance_wb_err(struct file *file)
     732             : {
     733    21450696 :         int err = 0;
     734    21450696 :         errseq_t old = READ_ONCE(file->f_wb_err);
     735    21450696 :         struct address_space *mapping = file->f_mapping;
     736             : 
     737             :         /* Locklessly handle the common case where nothing has changed */
     738    21450696 :         if (errseq_check(&mapping->wb_err, old)) {
     739             :                 /* Something changed, must use slow path */
     740        3565 :                 spin_lock(&file->f_lock);
     741        3566 :                 old = file->f_wb_err;
     742        3566 :                 err = errseq_check_and_advance(&mapping->wb_err,
     743             :                                                 &file->f_wb_err);
     744        3565 :                 trace_file_check_and_advance_wb_err(file, old);
     745        3565 :                 spin_unlock(&file->f_lock);
     746             :         }
     747             : 
     748             :         /*
     749             :          * We're mostly using this function as a drop in replacement for
     750             :          * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
     751             :          * that the legacy code would have had on these flags.
     752             :          */
     753    21451667 :         clear_bit(AS_EIO, &mapping->flags);
     754    21455795 :         clear_bit(AS_ENOSPC, &mapping->flags);
     755    21456121 :         return err;
     756             : }
     757             : EXPORT_SYMBOL(file_check_and_advance_wb_err);
     758             : 
     759             : /**
     760             :  * file_write_and_wait_range - write out & wait on a file range
     761             :  * @file:       file pointing to address_space with pages
     762             :  * @lstart:     offset in bytes where the range starts
     763             :  * @lend:       offset in bytes where the range ends (inclusive)
     764             :  *
     765             :  * Write out and wait upon file offsets lstart->lend, inclusive.
     766             :  *
     767             :  * Note that @lend is inclusive (describes the last byte to be written) so
     768             :  * that this function can be used to write to the very end-of-file (end = -1).
     769             :  *
     770             :  * After writing out and waiting on the data, we check and advance the
     771             :  * f_wb_err cursor to the latest value, and return any errors detected there.
     772             :  *
     773             :  * Return: %0 on success, negative error code otherwise.
     774             :  */
     775    20935029 : int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
     776             : {
     777    20935029 :         int err = 0, err2;
     778    20935029 :         struct address_space *mapping = file->f_mapping;
     779             : 
     780    20935029 :         if (lend < lstart)
     781             :                 return 0;
     782             : 
     783    20935029 :         if (mapping_needs_writeback(mapping)) {
     784     5128645 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     785             :                                                  WB_SYNC_ALL);
     786             :                 /* See comment of filemap_write_and_wait() */
     787     5128768 :                 if (err != -EIO)
     788     5128569 :                         __filemap_fdatawait_range(mapping, lstart, lend);
     789             :         }
     790    20935026 :         err2 = file_check_and_advance_wb_err(file);
     791    20940613 :         if (!err)
     792    20940412 :                 err = err2;
     793             :         return err;
     794             : }
     795             : EXPORT_SYMBOL(file_write_and_wait_range);
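
/*
 * Sketch of a minimal ->fsync built on the helper above: flush and wait
 * on the requested range and report any writeback error recorded against
 * this struct file (file_write_and_wait_range() advances f_wb_err for us).
 * A real filesystem would also persist metadata before returning; the
 * function name is hypothetical and this is not actual filemap.c code.
 */
static int fsync_sketch(struct file *file, loff_t start, loff_t end,
			int datasync)
{
	int err = file_write_and_wait_range(file, start, end);

	if (err)
		return err;

	/* ... write out inode metadata / journal here (unless datasync
	 * allows skipping a clean inode) ... */
	return 0;
}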
     796             : 
     797             : /**
     798             :  * replace_page_cache_folio - replace a pagecache folio with a new one
     799             :  * @old:        folio to be replaced
     800             :  * @new:        folio to replace with
     801             :  *
     802             :  * This function replaces a folio in the pagecache with a new one.  On
     803             :  * success it acquires the pagecache reference for the new folio and
     804             :  * drops it for the old folio.  Both the old and new folios must be
     805             :  * locked.  This function does not add the new folio to the LRU, the
     806             :  * caller must do that.
     807             :  *
     808             :  * The remove + add is atomic.  This function cannot fail.
     809             :  */
     810           0 : void replace_page_cache_folio(struct folio *old, struct folio *new)
     811             : {
     812           0 :         struct address_space *mapping = old->mapping;
     813           0 :         void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
     814           0 :         pgoff_t offset = old->index;
     815           0 :         XA_STATE(xas, &mapping->i_pages, offset);
     816             : 
     817           0 :         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
     818           0 :         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
     819           0 :         VM_BUG_ON_FOLIO(new->mapping, new);
     820             : 
     821           0 :         folio_get(new);
     822           0 :         new->mapping = mapping;
     823           0 :         new->index = offset;
     824             : 
     825           0 :         mem_cgroup_migrate(old, new);
     826             : 
     827           0 :         xas_lock_irq(&xas);
     828           0 :         xas_store(&xas, new);
     829             : 
     830           0 :         old->mapping = NULL;
     831             :         /* hugetlb pages do not participate in page cache accounting. */
     832           0 :         if (!folio_test_hugetlb(old))
     833           0 :                 __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
     834           0 :         if (!folio_test_hugetlb(new))
     835           0 :                 __lruvec_stat_add_folio(new, NR_FILE_PAGES);
     836           0 :         if (folio_test_swapbacked(old))
     837           0 :                 __lruvec_stat_sub_folio(old, NR_SHMEM);
     838           0 :         if (folio_test_swapbacked(new))
     839           0 :                 __lruvec_stat_add_folio(new, NR_SHMEM);
     840           0 :         xas_unlock_irq(&xas);
     841           0 :         if (free_folio)
     842           0 :                 free_folio(old);
     843           0 :         folio_put(old);
     844           0 : }
     845             : EXPORT_SYMBOL_GPL(replace_page_cache_folio);
     846             : 
     847  3663369615 : noinline int __filemap_add_folio(struct address_space *mapping,
     848             :                 struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
     849             : {
     850  3663369615 :         XA_STATE(xas, &mapping->i_pages, index);
     851  3663369615 :         int huge = folio_test_hugetlb(folio);
     852  3663780499 :         bool charged = false;
     853  3663780499 :         long nr = 1;
     854             : 
     855  3663780499 :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
     856  3663780499 :         VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
     857  7327560998 :         mapping_set_update(&xas, mapping);
     858             : 
     859  3663780499 :         if (!huge) {
     860  3664107205 :                 int error = mem_cgroup_charge(folio, NULL, gfp);
     861  3665850728 :                 VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
     862  3665850728 :                 if (error)
     863             :                         return error;
     864  3665850728 :                 charged = true;
     865  3665850728 :                 xas_set_order(&xas, index, folio_order(folio));
     866  3665850728 :                 nr = folio_nr_pages(folio);
     867             :         }
     868             : 
     869  3665524022 :         gfp &= GFP_RECLAIM_MASK;
     870  3665524022 :         folio_ref_add(folio, nr);
     871  3667904378 :         folio->mapping = mapping;
     872  3667904378 :         folio->index = xas.xa_index;
     873             : 
     874  3668331123 :         do {
     875  3668331123 :                 unsigned int order = xa_get_order(xas.xa, xas.xa_index);
     876  3666813201 :                 void *entry, *old = NULL;
     877             : 
     878  3809395781 :                 if (order > folio_order(folio))
     879     2198382 :                         xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
     880             :                                         order, gfp);
     881  3666813201 :                 xas_lock_irq(&xas);
     882  3732147319 :                 xas_for_each_conflict(&xas, entry) {
     883    72793066 :                         old = entry;
     884    72793066 :                         if (!xa_is_value(entry)) {
     885     6666509 :                                 xas_set_err(&xas, -EEXIST);
     886     6666509 :                                 goto unlock;
     887             :                         }
     888             :                 }
     889             : 
     890  3658589604 :                 if (old) {
     891    58803426 :                         if (shadowp)
     892    58803426 :                                 *shadowp = old;
     893             :                         /* entry may have been split before we acquired lock */
     894    58803426 :                         order = xa_get_order(xas.xa, xas.xa_index);
     895    62397241 :                         if (order > folio_order(folio)) {
     896             :                                 /* How to handle large swap entries? */
     897     2122961 :                                 BUG_ON(shmem_mapping(mapping));
     898     2122961 :                                 xas_split(&xas, old, order);
     899     2122957 :                                 xas_reset(&xas);
     900             :                         }
     901             :                 }
     902             : 
     903  3658590087 :                 xas_store(&xas, folio);
     904  3658977770 :                 if (xas_error(&xas))
     905           0 :                         goto unlock;
     906             : 
     907  3658977770 :                 mapping->nrpages += nr;
     908             : 
     909             :                 /* hugetlb pages do not participate in page cache accounting */
     910  3658977770 :                 if (!huge) {
     911  3658977770 :                         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
     912  3657473660 :                         if (folio_test_pmd_mappable(folio))
     913      188589 :                                 __lruvec_stat_mod_folio(folio,
     914             :                                                 NR_FILE_THPS, nr);
     915             :                 }
     916   136440382 : unlock:
     917  3664140172 :                 xas_unlock_irq(&xas);
     918  3664918547 :         } while (xas_nomem(&xas, gfp));
     919             : 
     920  3671076978 :         if (xas_error(&xas))
     921     6666504 :                 goto error;
     922             : 
     923  3657743970 :         trace_mm_filemap_add_to_page_cache(folio);
     924  3657743970 :         return 0;
     925             : error:
     926     6666504 :         if (charged)
     927     6666506 :                 mem_cgroup_uncharge(folio);
     928     6666466 :         folio->mapping = NULL;
     929             :         /* Leave page->index set: truncation relies upon it */
     930     6666466 :         folio_put_refs(folio, nr);
     931     6666494 :         return xas_error(&xas);
     932             : }
     933             : ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
     934             : 
     935  3664630768 : int filemap_add_folio(struct address_space *mapping, struct folio *folio,
     936             :                                 pgoff_t index, gfp_t gfp)
     937             : {
     938  3664630768 :         void *shadow = NULL;
     939  3664630768 :         int ret;
     940             : 
     941  3664630768 :         __folio_set_locked(folio);
     942  3664230870 :         ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
     943  3662230929 :         if (unlikely(ret))
     944     6666489 :                 __folio_clear_locked(folio);
     945             :         else {
     946             :                 /*
     947             :                  * The folio might have been evicted from cache only
     948             :                  * recently, in which case it should be activated like
     949             :                  * any other repeatedly accessed folio.
     950             :                  * The exception is folios getting rewritten; evicting other
     951             :                  * data from the working set, only to cache data that will
     952             :                  * get overwritten with something else, is a waste of memory.
     953             :                  */
     954  3655564440 :                 WARN_ON_ONCE(folio_test_active(folio));
     955  3655564440 :                 if (!(gfp & __GFP_WRITE) && shadow)
     956    53124857 :                         workingset_refault(folio, shadow);
     957  3655565306 :                 folio_add_lru(folio);
     958             :         }
     959  3665817439 :         return ret;
     960             : }
     961             : EXPORT_SYMBOL_GPL(filemap_add_folio);
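
A rough usage sketch of the insertion path above: a filesystem that wants a locked folio at a given index typically allocates one, tries filemap_add_folio(), and falls back to a fresh lookup when it loses the race (-EEXIST). The helper name my_get_or_create_folio() and its error handling are assumptions for illustration, not code from this file.

/* Hypothetical sketch: insert a fresh folio at @index, tolerating -EEXIST. */
#include <linux/err.h>
#include <linux/pagemap.h>

static struct folio *my_get_or_create_folio(struct address_space *mapping,
                                            pgoff_t index, gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);
        int err;

        if (!folio)
                return ERR_PTR(-ENOMEM);

        err = filemap_add_folio(mapping, folio, index, gfp);
        if (!err)
                return folio;           /* returned locked and on the LRU */

        folio_put(folio);
        if (err == -EEXIST)             /* someone else inserted one first */
                return __filemap_get_folio(mapping, index, FGP_LOCK, gfp);
        return ERR_PTR(err);
}
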
     962             : 
     963             : #ifdef CONFIG_NUMA
     964  3664269834 : struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
     965             : {
     966  3664269834 :         int n;
     967  3664269834 :         struct folio *folio;
     968             : 
     969  3664269834 :         if (cpuset_do_page_mem_spread()) {
     970       20815 :                 unsigned int cpuset_mems_cookie;
     971       20815 :                 do {
     972       20815 :                         cpuset_mems_cookie = read_mems_allowed_begin();
     973           0 :                         n = cpuset_mem_spread_node();
     974           0 :                         folio = __folio_alloc_node(gfp, order, n);
     975           0 :                 } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
     976             : 
     977           0 :                 return folio;
     978             :         }
     979  3663659221 :         return folio_alloc(gfp, order);
     980             : }
     981             : EXPORT_SYMBOL(filemap_alloc_folio);
     982             : #endif
     983             : 
     984             : /*
     985             :  * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
     986             :  *
     987             :  * Exclusively lock the invalidate_lock of any passed mapping that is not NULL.
     988             :  *
     989             :  * @mapping1: the first mapping to lock
     990             :  * @mapping2: the second mapping to lock
     991             :  */
     992   191496525 : void filemap_invalidate_lock_two(struct address_space *mapping1,
     993             :                                  struct address_space *mapping2)
     994             : {
     995   191496525 :         if (mapping1 > mapping2)
     996    93363377 :                 swap(mapping1, mapping2);
     997   191496525 :         if (mapping1)
     998   191496525 :                 down_write(&mapping1->invalidate_lock);
     999   191502866 :         if (mapping2 && mapping1 != mapping2)
    1000   185331504 :                 down_write_nested(&mapping2->invalidate_lock, 1);
    1001   191507936 : }
    1002             : EXPORT_SYMBOL(filemap_invalidate_lock_two);
    1003             : 
    1004             : /*
    1005             :  * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
    1006             :  *
    1007             :  * Unlock the exclusively held invalidate_lock of any passed mapping that is not NULL.
    1008             :  *
    1009             :  * @mapping1: the first mapping to unlock
    1010             :  * @mapping2: the second mapping to unlock
    1011             :  */
    1012   191466092 : void filemap_invalidate_unlock_two(struct address_space *mapping1,
    1013             :                                    struct address_space *mapping2)
    1014             : {
    1015   191466092 :         if (mapping1)
    1016   191466092 :                 up_write(&mapping1->invalidate_lock);
    1017   191498983 :         if (mapping2 && mapping1 != mapping2)
    1018   185333206 :                 up_write(&mapping2->invalidate_lock);
    1019   191492186 : }
    1020             : EXPORT_SYMBOL(filemap_invalidate_unlock_two);
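
The pointer comparison in the lock function gives the two mappings a stable lock order, so cross-file operations (remap/dedupe style) can take both invalidate_locks without deadlocking. A minimal sketch of a caller follows; my_remap_ranges() is an assumed helper, not an existing API.

/* Hypothetical sketch: hold both files' invalidate_locks across an operation. */
#include <linux/fs.h>

static int my_cross_file_op(struct file *src, struct file *dst)
{
        struct address_space *m1 = src->f_mapping;
        struct address_space *m2 = dst->f_mapping;
        int ret;

        filemap_invalidate_lock_two(m1, m2);
        ret = my_remap_ranges(src, dst);        /* assumed helper */
        filemap_invalidate_unlock_two(m1, m2);
        return ret;
}
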
    1021             : 
    1022             : /*
    1023             :  * In order to wait for pages to become available there must be
    1024             :  * waitqueues associated with pages. By using a hash table of
    1025             :  * waitqueues where the bucket discipline is to maintain all
    1026             :  * waiters on the same queue and wake all when any of the pages
    1027             :  * become available, and for the woken contexts to check to be
    1028             :  * sure the appropriate page became available, this saves space
    1029             :  * at a cost of "thundering herd" phenomena during rare hash
    1030             :  * collisions.
    1031             :  */
    1032             : #define PAGE_WAIT_TABLE_BITS 8
    1033             : #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
    1034             : static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
    1035             : 
    1036             : static wait_queue_head_t *folio_waitqueue(struct folio *folio)
    1037             : {
    1038   148813063 :         return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
    1039             : }
    1040             : 
    1041           0 : void __init pagecache_init(void)
    1042             : {
    1043           0 :         int i;
    1044             : 
    1045           0 :         for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
    1046           0 :                 init_waitqueue_head(&folio_wait_table[i]);
    1047             : 
    1048           0 :         page_writeback_init();
    1049           0 : }
    1050             : 
    1051             : /*
    1052             :  * The page wait code treats the "wait->flags" somewhat unusually, because
    1053             :  * we have multiple different kinds of waits, not just the usual "exclusive"
    1054             :  * one.
    1055             :  *
    1056             :  * We have:
    1057             :  *
    1058             :  *  (a) no special bits set:
    1059             :  *
    1060             :  *      We're just waiting for the bit to be released, and when a waker
    1061             :  *      calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
    1062             :  *      and remove it from the wait queue.
    1063             :  *
    1064             :  *      Simple and straightforward.
    1065             :  *
    1066             :  *  (b) WQ_FLAG_EXCLUSIVE:
    1067             :  *
    1068             :  *      The waiter is waiting to get the lock, and only one waiter should
    1069             :  *      be woken up to avoid any thundering herd behavior. We'll set the
    1070             :  *      WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
    1071             :  *
    1072             :  *      This is the traditional exclusive wait.
    1073             :  *
    1074             :  *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
    1075             :  *
    1076             :  *      The waiter is waiting to get the bit, and additionally wants the
    1077             :  *      lock to be transferred to it for fair lock behavior. If the lock
    1078             :  *      cannot be taken, we stop walking the wait queue without waking
    1079             :  *      the waiter.
    1080             :  *
    1081             :  *      This is the "fair lock handoff" case, and in addition to setting
    1082             :  *      WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
    1083             :  *      that it now has the lock.
    1084             :  */
    1085    70212890 : static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
    1086             : {
    1087    70212890 :         unsigned int flags;
    1088    70212890 :         struct wait_page_key *key = arg;
    1089    70212890 :         struct wait_page_queue *wait_page
    1090    70212890 :                 = container_of(wait, struct wait_page_queue, wait);
    1091             : 
    1092    70212890 :         if (!wake_page_match(wait_page, key))
    1093      393095 :                 return 0;
    1094             : 
    1095             :         /*
    1096             :          * If it's a lock handoff wait, we get the bit for it, and
    1097             :          * stop walking (and do not wake it up) if we can't.
    1098             :          */
    1099    69819795 :         flags = wait->flags;
    1100    69819795 :         if (flags & WQ_FLAG_EXCLUSIVE) {
    1101    13241808 :                 if (test_bit(key->bit_nr, &key->folio->flags))
    1102             :                         return -1;
    1103     6572608 :                 if (flags & WQ_FLAG_CUSTOM) {
    1104        6420 :                         if (test_and_set_bit(key->bit_nr, &key->folio->flags))
    1105             :                                 return -1;
    1106        6420 :                         flags |= WQ_FLAG_DONE;
    1107             :                 }
    1108             :         }
    1109             : 
    1110             :         /*
    1111             :          * We are holding the wait-queue lock, but the waiter that
    1112             :          * is waiting for this will be checking the flags without
    1113             :          * any locking.
    1114             :          *
    1115             :          * So update the flags atomically, and wake up the waiter
    1116             :          * afterwards to avoid any races. This store-release pairs
    1117             :          * with the load-acquire in folio_wait_bit_common().
    1118             :          */
    1119    69771499 :         smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
    1120    69771492 :         wake_up_state(wait->private, mode);
    1121             : 
    1122             :         /*
    1123             :          * Ok, we have successfully done what we're waiting for,
    1124             :          * and we can unconditionally remove the wait entry.
    1125             :          *
    1126             :          * Note that this pairs with the "finish_wait()" in the
    1127             :          * waiter, and has to be the absolute last thing we do.
    1128             :          * After this list_del_init(&wait->entry) the wait entry
    1129             :          * might be de-allocated and the process might even have
    1130             :          * exited.
    1131             :          */
    1132    69771525 :         list_del_init_careful(&wait->entry);
    1133    69771505 :         return (flags & WQ_FLAG_EXCLUSIVE) != 0;
    1134             : }
    1135             : 
    1136    74871821 : static void folio_wake_bit(struct folio *folio, int bit_nr)
    1137             : {
    1138    74871821 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1139    74871821 :         struct wait_page_key key;
    1140    74871821 :         unsigned long flags;
    1141    74871821 :         wait_queue_entry_t bookmark;
    1142             : 
    1143    74871821 :         key.folio = folio;
    1144    74871821 :         key.bit_nr = bit_nr;
    1145    74871821 :         key.page_match = 0;
    1146             : 
    1147    74871821 :         bookmark.flags = 0;
    1148    74871821 :         bookmark.private = NULL;
    1149    74871821 :         bookmark.func = NULL;
    1150    74871821 :         INIT_LIST_HEAD(&bookmark.entry);
    1151             : 
    1152    74871821 :         spin_lock_irqsave(&q->lock, flags);
    1153    74891107 :         __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1154             : 
    1155    74892066 :         while (bookmark.flags & WQ_FLAG_BOOKMARK) {
    1156             :                 /*
    1157             :                  * Take a breather from holding the lock,
    1158             :                  * allow pages that finish wake up asynchronously
    1159             :                  * to acquire the lock and remove themselves
    1160             :                  * from wait queue
    1161             :                  */
    1162         965 :                 spin_unlock_irqrestore(&q->lock, flags);
    1163         965 :                 cpu_relax();
    1164         965 :                 spin_lock_irqsave(&q->lock, flags);
    1165         965 :                 __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1166             :         }
    1167             : 
    1168             :         /*
    1169             :          * It's possible to miss clearing waiters here, when we woke our page
    1170             :          * waiters, but the hashed waitqueue has waiters for other pages on it.
    1171             :          * That's okay, it's a rare case. The next waker will clear it.
    1172             :          *
    1173             :          * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
    1174             :          * other), the flag may be cleared in the course of freeing the page;
    1175             :          * but that is not required for correctness.
    1176             :          */
    1177    74891064 :         if (!waitqueue_active(q) || !key.page_match)
    1178    74363690 :                 folio_clear_waiters(folio);
    1179             : 
    1180    74891119 :         spin_unlock_irqrestore(&q->lock, flags);
    1181    74890862 : }
    1182             : 
    1183   806474997 : static void folio_wake(struct folio *folio, int bit)
    1184             : {
    1185   806474997 :         if (!folio_test_waiters(folio))
    1186             :                 return;
    1187    32184265 :         folio_wake_bit(folio, bit);
    1188             : }
    1189             : 
    1190             : /*
    1191             :  * A choice of three behaviors for folio_wait_bit_common():
    1192             :  */
    1193             : enum behavior {
    1194             :         EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
    1195             :                          * __folio_lock() waiting on then setting PG_locked.
    1196             :                          */
    1197             :         SHARED,         /* Hold ref to page and check the bit when woken, like
    1198             :                          * folio_wait_writeback() waiting on PG_writeback.
    1199             :                          */
    1200             :         DROP,           /* Drop ref to page before wait, no check when woken,
    1201             :                          * like folio_put_wait_locked() on PG_locked.
    1202             :                          */
    1203             : };
    1204             : 
    1205             : /*
    1206             :  * Attempt to check (or get) the folio flag, and mark us done
    1207             :  * if successful.
    1208             :  */
    1209    74443950 : static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
    1210             :                                         struct wait_queue_entry *wait)
    1211             : {
    1212    74443950 :         if (wait->flags & WQ_FLAG_EXCLUSIVE) {
    1213    10320189 :                 if (test_and_set_bit(bit_nr, &folio->flags))
    1214             :                         return false;
    1215   128247506 :         } else if (test_bit(bit_nr, &folio->flags))
    1216             :                 return false;
    1217             : 
    1218     4671834 :         wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
    1219     4671834 :         return true;
    1220             : }
    1221             : 
    1222             : /* How many times do we accept lock stealing from under a waiter? */
    1223             : int sysctl_page_lock_unfairness = 5;
    1224             : 
    1225    73938572 : static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
    1226             :                 int state, enum behavior behavior)
    1227             : {
    1228    73938572 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1229    73938572 :         int unfairness = sysctl_page_lock_unfairness;
    1230    73938572 :         struct wait_page_queue wait_page;
    1231    73938572 :         wait_queue_entry_t *wait = &wait_page.wait;
    1232    73938572 :         bool thrashing = false;
    1233    73938572 :         unsigned long pflags;
    1234    73938572 :         bool in_thrashing;
    1235             : 
    1236   114712899 :         if (bit_nr == PG_locked &&
    1237    32924189 :             !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
    1238      242887 :                 delayacct_thrashing_start(&in_thrashing);
    1239      242881 :                 psi_memstall_enter(&pflags);
    1240      242881 :                 thrashing = true;
    1241             :         }
    1242             : 
    1243    73938101 :         init_wait(wait);
    1244    73938101 :         wait->func = wake_page_function;
    1245    73938101 :         wait_page.folio = folio;
    1246    73938101 :         wait_page.bit_nr = bit_nr;
    1247             : 
    1248    74376619 : repeat:
    1249    74376619 :         wait->flags = 0;
    1250    74376619 :         if (behavior == EXCLUSIVE) {
    1251    10235448 :                 wait->flags = WQ_FLAG_EXCLUSIVE;
    1252    10235448 :                 if (--unfairness < 0)
    1253        7059 :                         wait->flags |= WQ_FLAG_CUSTOM;
    1254             :         }
    1255             : 
    1256             :         /*
    1257             :          * Do one last check whether we can get the
    1258             :          * page bit synchronously.
    1259             :          *
    1260             :          * Do the folio_set_waiters() marking before that
    1261             :          * to let any waker we _just_ missed know they
    1262             :          * need to wake us up (otherwise they'll never
    1263             :          * even go to the slow case that looks at the
    1264             :          * page queue), and add ourselves to the wait
    1265             :          * queue if we need to sleep.
    1266             :          *
    1267             :          * This part needs to be done under the queue
    1268             :          * lock to avoid races.
    1269             :          */
    1270    74376619 :         spin_lock_irq(&q->lock);
    1271    74440734 :         folio_set_waiters(folio);
    1272    74441308 :         if (!folio_trylock_flag(folio, bit_nr, wait))
    1273    69769371 :                 __add_wait_queue_entry_tail(q, wait);
    1274    74440936 :         spin_unlock_irq(&q->lock);
    1275             : 
    1276             :         /*
    1277             :          * From now on, all the logic will be based on
    1278             :          * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
    1279             :          * see whether the page bit testing has already
    1280             :          * been done by the wake function.
    1281             :          *
    1282             :          * We can drop our reference to the folio.
    1283             :          */
    1284    74441011 :         if (behavior == DROP)
    1285    11681718 :                 folio_put(folio);
    1286             : 
    1287             :         /*
    1288             :          * Note that until the "finish_wait()", or until
    1289             :          * we see the WQ_FLAG_WOKEN flag, we need to
    1290             :          * be very careful with the 'wait->flags', because
    1291             :          * we may race with a waker that sets them.
    1292             :          */
    1293   213354838 :         for (;;) {
    1294   143897909 :                 unsigned int flags;
    1295             : 
    1296   143897909 :                 set_current_state(state);
    1297             : 
    1298             :                 /* Loop until we've been woken or interrupted */
    1299   143899593 :                 flags = smp_load_acquire(&wait->flags);
    1300   143901014 :                 if (!(flags & WQ_FLAG_WOKEN)) {
    1301    69463838 :                         if (signal_pending_state(state, current))
    1302             :                                 break;
    1303             : 
    1304    69462473 :                         io_schedule();
    1305    69456929 :                         continue;
    1306             :                 }
    1307             : 
    1308             :                 /* If we were non-exclusive, we're done */
    1309    74437176 :                 if (behavior != EXCLUSIVE)
    1310             :                         break;
    1311             : 
    1312             :                 /* If the waker got the lock for us, we're done */
    1313    10319945 :                 if (flags & WQ_FLAG_DONE)
    1314             :                         break;
    1315             : 
    1316             :                 /*
    1317             :                  * Otherwise, if we're getting the lock, we need to
    1318             :                  * try to get it ourselves.
    1319             :                  *
    1320             :                  * And if that fails, we'll have to retry this all.
    1321             :                  */
    1322     6566125 :                 if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
    1323      438518 :                         goto repeat;
    1324             : 
    1325     6127626 :                 wait->flags |= WQ_FLAG_DONE;
    1326     6127626 :                 break;
    1327             :         }
    1328             : 
    1329             :         /*
    1330             :          * If a signal happened, this 'finish_wait()' may remove the last
    1331             :          * waiter from the wait-queues, but the folio waiters bit will remain
    1332             :          * set. That's ok. The next wakeup will take care of it, and trying
    1333             :          * to do it here would be difficult and prone to races.
    1334             :          */
    1335    73998633 :         finish_wait(q, wait);
    1336             : 
    1337    73989107 :         if (thrashing) {
    1338      242909 :                 delayacct_thrashing_end(&in_thrashing);
    1339      242909 :                 psi_memstall_leave(&pflags);
    1340             :         }
    1341             : 
    1342             :         /*
    1343             :          * NOTE! The wait->flags weren't stable until we've done the
    1344             :          * 'finish_wait()', and we could have exited the loop above due
    1345             :          * to a signal, and had a wakeup event happen after the signal
    1346             :          * test but before the 'finish_wait()'.
    1347             :          *
    1348             :          * So only after the finish_wait() can we reliably determine
    1349             :          * if we got woken up or not, so we can now figure out the final
    1350             :          * return value based on that state without races.
    1351             :          *
    1352             :          * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
    1353             :          * waiter, but an exclusive one requires WQ_FLAG_DONE.
    1354             :          */
    1355    73989107 :         if (behavior == EXCLUSIVE)
    1356     9881585 :                 return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
    1357             : 
    1358    64107522 :         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
    1359             : }
    1360             : 
    1361             : #ifdef CONFIG_MIGRATION
    1362             : /**
    1363             :  * migration_entry_wait_on_locked - Wait for a migration entry to be removed
    1364             :  * @entry: migration swap entry.
    1365             :  * @ptl: already locked ptl. This function will drop the lock.
    1366             :  *
    1367             :  * Wait for a migration entry referencing the given page to be removed. This is
    1368             :  * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
    1369             :  * this can be called without taking a reference on the page. Instead this
    1370             :  * should be called while holding the ptl for the migration entry referencing
    1371             :  * the page.
    1372             :  *
    1373             :  * Returns after unlocking the ptl.
    1374             :  *
    1375             :  * This follows the same logic as folio_wait_bit_common() so see the comments
    1376             :  * there.
    1377             :  */
    1378        2670 : void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
    1379             :         __releases(ptl)
    1380             : {
    1381        2670 :         struct wait_page_queue wait_page;
    1382        2670 :         wait_queue_entry_t *wait = &wait_page.wait;
    1383        2670 :         bool thrashing = false;
    1384        2670 :         unsigned long pflags;
    1385        2670 :         bool in_thrashing;
    1386        2670 :         wait_queue_head_t *q;
    1387        2670 :         struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
    1388             : 
    1389        2670 :         q = folio_waitqueue(folio);
    1390        5340 :         if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
    1391           0 :                 delayacct_thrashing_start(&in_thrashing);
    1392           0 :                 psi_memstall_enter(&pflags);
    1393           0 :                 thrashing = true;
    1394             :         }
    1395             : 
    1396        2670 :         init_wait(wait);
    1397        2670 :         wait->func = wake_page_function;
    1398        2670 :         wait_page.folio = folio;
    1399        2670 :         wait_page.bit_nr = PG_locked;
    1400        2670 :         wait->flags = 0;
    1401             : 
    1402        2670 :         spin_lock_irq(&q->lock);
    1403        2670 :         folio_set_waiters(folio);
    1404        2670 :         if (!folio_trylock_flag(folio, PG_locked, wait))
    1405        2670 :                 __add_wait_queue_entry_tail(q, wait);
    1406        2670 :         spin_unlock_irq(&q->lock);
    1407             : 
    1408             :         /*
    1409             :          * If a migration entry exists for the page the migration path must hold
    1410             :          * a valid reference to the page, and it must take the ptl to remove the
    1411             :          * migration entry. So the page is valid until the ptl is dropped.
    1412             :          */
    1413        2670 :         spin_unlock(ptl);
    1414             : 
    1415        8008 :         for (;;) {
    1416        5339 :                 unsigned int flags;
    1417             : 
    1418        5339 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1419             : 
    1420             :                 /* Loop until we've been woken or interrupted */
    1421        5339 :                 flags = smp_load_acquire(&wait->flags);
    1422        5339 :                 if (!(flags & WQ_FLAG_WOKEN)) {
    1423        2669 :                         if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
    1424             :                                 break;
    1425             : 
    1426        2669 :                         io_schedule();
    1427        2669 :                         continue;
    1428             :                 }
    1429             :                 break;
    1430             :         }
    1431             : 
    1432        2670 :         finish_wait(q, wait);
    1433             : 
    1434        2670 :         if (thrashing) {
    1435           0 :                 delayacct_thrashing_end(&in_thrashing);
    1436           0 :                 psi_memstall_leave(&pflags);
    1437             :         }
    1438        2670 : }
    1439             : #endif
    1440             : 
    1441    33669610 : void folio_wait_bit(struct folio *folio, int bit_nr)
    1442             : {
    1443    33669610 :         folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
    1444           0 : }
    1445             : EXPORT_SYMBOL(folio_wait_bit);
    1446             : 
    1447           0 : int folio_wait_bit_killable(struct folio *folio, int bit_nr)
    1448             : {
    1449    18768307 :         return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
    1450             : }
    1451             : EXPORT_SYMBOL(folio_wait_bit_killable);
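
folio_wait_bit() is the building block behind helpers such as folio_wait_locked() and folio_wait_writeback(): the caller re-checks the flag and sleeps until the corresponding folio_wake_bit() fires. A sketch of that pattern, assuming the caller already holds a folio reference:

/* Hypothetical sketch: wait until writeback on @folio completes. */
#include <linux/pagemap.h>

static void my_wait_for_writeback(struct folio *folio)
{
        while (folio_test_writeback(folio))
                folio_wait_bit(folio, PG_writeback);
}
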
    1452             : 
    1453             : /**
    1454             :  * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
    1455             :  * @folio: The folio to wait for.
    1456             :  * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
    1457             :  *
    1458             :  * The caller should hold a reference on @folio.  They expect the page to
    1459             :  * become unlocked relatively soon, but do not wish to hold up migration
    1460             :  * (for example) by holding the reference while waiting for the folio to
    1461             :  * come unlocked.  After this function returns, the caller should not
    1462             :  * dereference @folio.
    1463             :  *
    1464             :  * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
    1465             :  */
    1466             : static int folio_put_wait_locked(struct folio *folio, int state)
    1467             : {
    1468    11681528 :         return folio_wait_bit_common(folio, PG_locked, state, DROP);
    1469             : }
    1470             : 
    1471             : /**
    1472             :  * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
    1473             :  * @folio: Folio defining the wait queue of interest
    1474             :  * @waiter: Waiter to add to the queue
    1475             :  *
    1476             :  * Add an arbitrary @waiter to the wait queue for the nominated @folio.
    1477             :  */
    1478           0 : void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
    1479             : {
    1480           0 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1481           0 :         unsigned long flags;
    1482             : 
    1483           0 :         spin_lock_irqsave(&q->lock, flags);
    1484           0 :         __add_wait_queue_entry_tail(q, waiter);
    1485           0 :         folio_set_waiters(folio);
    1486           0 :         spin_unlock_irqrestore(&q->lock, flags);
    1487           0 : }
    1488             : EXPORT_SYMBOL_GPL(folio_add_wait_queue);
    1489             : 
    1490             : #ifndef clear_bit_unlock_is_negative_byte
    1491             : 
    1492             : /*
    1493             :  * PG_waiters is the high bit in the same byte as PG_locked.
    1494             :  *
    1495             :  * On x86 (and on many other architectures), we can clear PG_locked and
    1496             :  * test the sign bit at the same time. But if the architecture does
    1497             :  * not support that special operation, we just do this all by hand
    1498             :  * instead.
    1499             :  *
    1500             :  * The read of PG_waiters has to be after (or concurrently with) PG_locked
    1501             :  * being cleared, but a memory barrier should be unnecessary since it is
    1502             :  * in the same byte as PG_locked.
    1503             :  */
    1504             : static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
    1505             : {
    1506             :         clear_bit_unlock(nr, mem);
    1507             :         /* smp_mb__after_atomic(); */
    1508             :         return test_bit(PG_waiters, mem);
    1509             : }
    1510             : 
    1511             : #endif
    1512             : 
    1513             : /**
    1514             :  * folio_unlock - Unlock a locked folio.
    1515             :  * @folio: The folio.
    1516             :  *
    1517             :  * Unlocks the folio and wakes up any thread sleeping on the page lock.
    1518             :  *
    1519             :  * Context: May be called from interrupt or process context.  May not be
    1520             :  * called from NMI context.
    1521             :  */
    1522 70828716029 : void folio_unlock(struct folio *folio)
    1523             : {
    1524             :         /* Bit 7 allows x86 to check the byte's sign bit */
    1525 70828716029 :         BUILD_BUG_ON(PG_waiters != 7);
    1526 70828716029 :         BUILD_BUG_ON(PG_locked > 7);
    1527 70828716029 :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    1528 70828716029 :         if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
    1529    42701929 :                 folio_wake_bit(folio, PG_locked);
    1530 70880971351 : }
    1531             : EXPORT_SYMBOL(folio_unlock);
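
folio_unlock() pairs with folio_lock()/folio_trylock(); any sleeper queued by folio_wait_bit_common() above is woken through folio_wake_bit(). The usual caller also re-checks folio->mapping after taking the lock, because the folio may have been truncated while the caller slept. A minimal sketch; the work done under the lock is left as an assumption.

/* Hypothetical sketch: the common lock / re-check / unlock pattern. */
#include <linux/pagemap.h>

static void my_touch_folio(struct folio *folio, struct address_space *mapping)
{
        folio_lock(folio);
        if (folio->mapping == mapping) {
                /* ... safe to operate on the folio under PG_locked ... */
        }
        folio_unlock(folio);
}
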
    1532             : 
    1533             : /**
    1534             :  * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
    1535             :  * @folio: The folio.
    1536             :  *
    1537             :  * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
    1538             :  * it.  The folio reference held for PG_private_2 being set is released.
    1539             :  *
    1540             :  * This is, for example, used when a netfs folio is being written to a local
    1541             :  * disk cache, thereby allowing writes to the cache for the same folio to be
    1542             :  * serialised.
    1543             :  */
    1544           0 : void folio_end_private_2(struct folio *folio)
    1545             : {
    1546           0 :         VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
    1547           0 :         clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
    1548           0 :         folio_wake_bit(folio, PG_private_2);
    1549           0 :         folio_put(folio);
    1550           0 : }
    1551             : EXPORT_SYMBOL(folio_end_private_2);
    1552             : 
    1553             : /**
    1554             :  * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
    1555             :  * @folio: The folio to wait on.
    1556             :  *
    1557             :  * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
    1558             :  */
    1559           0 : void folio_wait_private_2(struct folio *folio)
    1560             : {
    1561           0 :         while (folio_test_private_2(folio))
    1562           0 :                 folio_wait_bit(folio, PG_private_2);
    1563           0 : }
    1564             : EXPORT_SYMBOL(folio_wait_private_2);
    1565             : 
    1566             : /**
    1567             :  * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
    1568             :  * @folio: The folio to wait on.
    1569             :  *
    1570             :  * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
    1571             :  * fatal signal is received by the calling task.
    1572             :  *
    1573             :  * Return:
    1574             :  * - 0 if successful.
    1575             :  * - -EINTR if a fatal signal was encountered.
    1576             :  */
    1577           0 : int folio_wait_private_2_killable(struct folio *folio)
    1578             : {
    1579           0 :         int ret = 0;
    1580             : 
    1581           0 :         while (folio_test_private_2(folio)) {
    1582           0 :                 ret = folio_wait_bit_killable(folio, PG_private_2);
    1583           0 :                 if (ret < 0)
    1584             :                         break;
    1585             :         }
    1586             : 
    1587           0 :         return ret;
    1588             : }
    1589             : EXPORT_SYMBOL(folio_wait_private_2_killable);
    1590             : 
    1591             : /**
    1592             :  * folio_end_writeback - End writeback against a folio.
    1593             :  * @folio: The folio.
    1594             :  */
    1595   806471083 : void folio_end_writeback(struct folio *folio)
    1596             : {
    1597             :         /*
    1598             :          * folio_test_clear_reclaim() could be used here but it is an
    1599             :          * atomic operation and overkill in this particular case. Failing
    1600             :          * to shuffle a folio marked for immediate reclaim is too mild
    1601             :          * a gain to justify taking an atomic operation penalty at the
    1602             :          * end of every folio writeback.
    1603             :          */
    1604   806471083 :         if (folio_test_reclaim(folio)) {
    1605     1094474 :                 folio_clear_reclaim(folio);
    1606     1094475 :                 folio_rotate_reclaimable(folio);
    1607             :         }
    1608             : 
    1609             :         /*
    1610             :          * Writeback does not hold a folio reference of its own, relying
    1611             :          * on truncation to wait for the clearing of PG_writeback.
    1612             :          * But here we must make sure that the folio is not freed and
    1613             :          * reused before the folio_wake().
    1614             :          */
    1615   806471081 :         folio_get(folio);
    1616   806473567 :         if (!__folio_end_writeback(folio))
    1617           0 :                 BUG();
    1618             : 
    1619   806475073 :         smp_mb__after_atomic();
    1620   806475073 :         folio_wake(folio, PG_writeback);
    1621   806475055 :         acct_reclaim_writeback(folio);
    1622   806474942 :         folio_put(folio);
    1623   806475486 : }
    1624             : EXPORT_SYMBOL(folio_end_writeback);
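
Write-completion paths call folio_end_writeback() once per folio when the I/O finishes; the folio_wake() above is what releases anyone sleeping in folio_wait_writeback(). A sketch of a completion handler for a bio-based filesystem, using the generic bio folio iterator; my_write_end_io() itself is an assumption:

/* Hypothetical sketch: end-of-I/O handler clearing PG_writeback per folio. */
#include <linux/bio.h>
#include <linux/pagemap.h>

static void my_write_end_io(struct bio *bio)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio)
                folio_end_writeback(fi.folio);
        bio_put(bio);
}
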
    1625             : 
    1626             : /**
    1627             :  * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
    1628             :  * @folio: The folio to lock
    1629             :  */
    1630     6292072 : void __folio_lock(struct folio *folio)
    1631             : {
    1632     7211620 :         folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
    1633             :                                 EXCLUSIVE);
    1634      919523 : }
    1635             : EXPORT_SYMBOL(__folio_lock);
    1636             : 
    1637           0 : int __folio_lock_killable(struct folio *folio)
    1638             : {
    1639           0 :         return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
    1640             :                                         EXCLUSIVE);
    1641             : }
    1642             : EXPORT_SYMBOL_GPL(__folio_lock_killable);
    1643             : 
    1644           0 : static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
    1645             : {
    1646           0 :         struct wait_queue_head *q = folio_waitqueue(folio);
    1647           0 :         int ret = 0;
    1648             : 
    1649           0 :         wait->folio = folio;
    1650           0 :         wait->bit_nr = PG_locked;
    1651             : 
    1652           0 :         spin_lock_irq(&q->lock);
    1653           0 :         __add_wait_queue_entry_tail(q, &wait->wait);
    1654           0 :         folio_set_waiters(folio);
    1655           0 :         ret = !folio_trylock(folio);
    1656             :         /*
    1657             :          * If we were successful now, we know we're still on the
    1658             :          * waitqueue as we're still under the lock. This means it's
    1659             :          * safe to remove and return success, we know the callback
    1660             :          * isn't going to trigger.
    1661             :          */
    1662           0 :         if (!ret)
    1663           0 :                 __remove_wait_queue(q, &wait->wait);
    1664             :         else
    1665             :                 ret = -EIOCBQUEUED;
    1666           0 :         spin_unlock_irq(&q->lock);
    1667           0 :         return ret;
    1668             : }
    1669             : 
    1670             : /*
    1671             :  * Return values:
    1672             :  * true - folio is locked; mmap_lock is still held.
    1673             :  * false - folio is not locked.
    1674             :  *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
    1675             :  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
    1676             :  *     which case mmap_lock is still held.
    1677             :  *
    1678             :  * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
    1679             :  * with the folio locked and the mmap_lock unperturbed.
    1680             :  */
    1681           0 : bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
    1682             :                          unsigned int flags)
    1683             : {
    1684           0 :         if (fault_flag_allow_retry_first(flags)) {
    1685             :                 /*
    1686             :                  * CAUTION! In this case, mmap_lock is not released
    1687             :                  * even though false is returned.
    1688             :                  */
    1689           0 :                 if (flags & FAULT_FLAG_RETRY_NOWAIT)
    1690             :                         return false;
    1691             : 
    1692           0 :                 mmap_read_unlock(mm);
    1693           0 :                 if (flags & FAULT_FLAG_KILLABLE)
    1694           0 :                         folio_wait_locked_killable(folio);
    1695             :                 else
    1696           0 :                         folio_wait_locked(folio);
    1697           0 :                 return false;
    1698             :         }
    1699           0 :         if (flags & FAULT_FLAG_KILLABLE) {
    1700           0 :                 bool ret;
    1701             : 
    1702           0 :                 ret = __folio_lock_killable(folio);
    1703           0 :                 if (ret) {
    1704           0 :                         mmap_read_unlock(mm);
    1705           0 :                         return false;
    1706             :                 }
    1707             :         } else {
    1708           0 :                 __folio_lock(folio);
    1709             :         }
    1710             : 
    1711             :         return true;
    1712             : }
    1713             : 
    1714             : /**
    1715             :  * page_cache_next_miss() - Find the next gap in the page cache.
    1716             :  * @mapping: Mapping.
    1717             :  * @index: Index.
    1718             :  * @max_scan: Maximum range to search.
    1719             :  *
    1720             :  * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
    1721             :  * gap with the lowest index.
    1722             :  *
    1723             :  * This function may be called under the rcu_read_lock.  However, this will
    1724             :  * not atomically search a snapshot of the cache at a single point in time.
    1725             :  * For example, if a gap is created at index 5, then subsequently a gap is
    1726             :  * created at index 10, page_cache_next_miss() covering both indices may
    1727             :  * return 10 if called under the rcu_read_lock.
    1728             :  *
    1729             :  * Return: The index of the gap if found, otherwise an index outside the
    1730             :  * range specified (in which case 'return - index >= max_scan' will be true).
    1731             :  * In the rare case of index wrap-around, 0 will be returned.
    1732             :  */
    1733      743940 : pgoff_t page_cache_next_miss(struct address_space *mapping,
    1734             :                              pgoff_t index, unsigned long max_scan)
    1735             : {
    1736      743940 :         XA_STATE(xas, &mapping->i_pages, index);
    1737             : 
    1738   127555017 :         while (max_scan--) {
    1739   127456646 :                 void *entry = xas_next(&xas);
    1740   127456617 :                 if (!entry || xa_is_value(entry))
    1741             :                         break;
    1742   126811077 :                 if (xas.xa_index == 0)
    1743             :                         break;
    1744             :         }
    1745             : 
    1746      743911 :         return xas.xa_index;
    1747             : }
    1748             : EXPORT_SYMBOL(page_cache_next_miss);
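
Per the return convention above, a result with 'return - index >= max_scan' means no gap was found in the scanned range. A sketch of how a readahead-style caller might turn that into a yes/no answer; my_hole_nearby() is an assumed helper:

/* Hypothetical sketch: is any page in [index, index + max) missing from cache? */
#include <linux/pagemap.h>

static bool my_hole_nearby(struct address_space *mapping, pgoff_t index,
                           unsigned long max)
{
        pgoff_t gap = page_cache_next_miss(mapping, index, max);

        /* Indices outside the scanned range mean "no gap found". */
        return gap - index < max;
}
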
    1749             : 
    1750             : /**
    1751             :  * page_cache_prev_miss() - Find the previous gap in the page cache.
    1752             :  * @mapping: Mapping.
    1753             :  * @index: Index.
    1754             :  * @max_scan: Maximum range to search.
    1755             :  *
    1756             :  * Search the range [max(index - max_scan + 1, 0), index] for the
    1757             :  * gap with the highest index.
    1758             :  *
    1759             :  * This function may be called under the rcu_read_lock.  However, this will
    1760             :  * not atomically search a snapshot of the cache at a single point in time.
    1761             :  * For example, if a gap is created at index 10, then subsequently a gap is
    1762             :  * created at index 5, page_cache_prev_miss() covering both indices may
    1763             :  * return 5 if called under the rcu_read_lock.
    1764             :  *
    1765             :  * Return: The index of the gap if found, otherwise an index outside the
    1766             :  * range specified (in which case 'index - return >= max_scan' will be true).
    1767             :  * In the rare case of wrap-around, ULONG_MAX will be returned.
    1768             :  */
    1769     9012795 : pgoff_t page_cache_prev_miss(struct address_space *mapping,
    1770             :                              pgoff_t index, unsigned long max_scan)
    1771             : {
    1772     9012795 :         XA_STATE(xas, &mapping->i_pages, index);
    1773             : 
    1774    12161293 :         while (max_scan--) {
    1775    12160447 :                 void *entry = xas_prev(&xas);
    1776    12160493 :                 if (!entry || xa_is_value(entry))
    1777             :                         break;
    1778     3148498 :                 if (xas.xa_index == ULONG_MAX)
    1779             :                         break;
    1780             :         }
    1781             : 
    1782     9012841 :         return xas.xa_index;
    1783             : }
    1784             : EXPORT_SYMBOL(page_cache_prev_miss);
    1785             : 
    1786             : /*
    1787             :  * Lockless page cache protocol:
    1788             :  * On the lookup side:
    1789             :  * 1. Load the folio from i_pages
    1790             :  * 2. Increment the refcount if it's not zero
    1791             :  * 3. If the folio is not found by xas_reload(), put the refcount and retry
    1792             :  *
    1793             :  * On the removal side:
    1794             :  * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
    1795             :  * B. Remove the page from i_pages
    1796             :  * C. Return the page to the page allocator
    1797             :  *
    1798             :  * This means that any page may have its reference count temporarily
    1799             :  * increased by a speculative page cache (or fast GUP) lookup as it can
    1800             :  * be allocated by another user before the RCU grace period expires.
    1801             :  * Because the refcount temporarily acquired here may end up being the
    1802             :  * last refcount on the page, any page allocation must be freeable by
    1803             :  * folio_put().
    1804             :  */
    1805             : 
    1806             : /*
    1807             :  * filemap_get_entry - Get a page cache entry.
    1808             :  * @mapping: the address_space to search
    1809             :  * @index: The page cache index.
    1810             :  *
    1811             :  * Looks up the page cache entry at @mapping & @index.  If it is a folio,
    1812             :  * it is returned with an increased refcount.  If it is a shadow entry
    1813             :  * of a previously evicted folio, or a swap entry from shmem/tmpfs,
    1814             :  * it is returned without further action.
    1815             :  *
    1816             :  * Return: The folio, swap or shadow entry, %NULL if nothing is found.
    1817             :  */
    1818 28894525802 : void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
    1819             : {
    1820 28894525802 :         XA_STATE(xas, &mapping->i_pages, index);
    1821 28894525802 :         struct folio *folio;
    1822             : 
    1823 28894525802 :         rcu_read_lock();
    1824             : repeat:
    1825 28650435609 :         xas_reset(&xas);
    1826 28650435609 :         folio = xas_load(&xas);
    1827 28755528410 :         if (xas_retry(&xas, folio))
    1828           0 :                 goto repeat;
    1829             :         /*
    1830             :          * A shadow entry of a recently evicted page, or a swap entry from
    1831             :          * shmem/tmpfs.  Return it without attempting to raise page count.
    1832             :          */
    1833 28755528410 :         if (!folio || xa_is_value(folio))
    1834  4632815810 :                 goto out;
    1835             : 
    1836 24122712600 :         if (!folio_try_get_rcu(folio))
    1837       45605 :                 goto repeat;
    1838             : 
    1839 24193571227 :         if (unlikely(folio != xas_reload(&xas))) {
    1840         146 :                 folio_put(folio);
    1841         146 :                 goto repeat;
    1842             :         }
    1843 24161314619 : out:
    1844 28794130429 :         rcu_read_unlock();
    1845             : 
    1846 28780126477 :         return folio;
    1847             : }
    1848             : 
    1849             : /**
    1850             :  * __filemap_get_folio - Find and get a reference to a folio.
    1851             :  * @mapping: The address_space to search.
    1852             :  * @index: The page index.
    1853             :  * @fgp_flags: %FGP flags modify how the folio is returned.
    1854             :  * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
    1855             :  *
    1856             :  * Looks up the page cache entry at @mapping & @index.
    1857             :  *
    1858             :  * @fgp_flags can be zero or more of these flags:
    1859             :  *
    1860             :  * * %FGP_ACCESSED - The folio will be marked accessed.
    1861             :  * * %FGP_LOCK - The folio is returned locked.
    1862             :  * * %FGP_CREAT - If no page is present then a new page is allocated using
    1863             :  *   @gfp and added to the page cache and the VM's LRU list.
    1864             :  *   The page is returned locked and with an increased refcount.
    1865             :  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
    1866             :  *   page is already in cache.  If the page was allocated, unlock it before
    1867             :  *   returning so the caller can do the same dance.
    1868             :  * * %FGP_WRITE - The page will be written to by the caller.
    1869             :  * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
    1870             :  * * %FGP_NOWAIT - Don't get blocked by page lock.
    1871             :  * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
    1872             :  *
    1873             :  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
    1874             :  * if the %GFP flags specified for %FGP_CREAT are atomic.
    1875             :  *
    1876             :  * If there is a page cache page, it is returned with an increased refcount.
    1877             :  *
    1878             :  * Return: The found folio or an ERR_PTR() otherwise.
    1879             :  */
    1880  7349207987 : struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
    1881             :                 int fgp_flags, gfp_t gfp)
    1882             : {
    1883  7349210302 :         struct folio *folio;
    1884             : 
    1885             : repeat:
    1886  7349210302 :         folio = filemap_get_entry(mapping, index);
    1887  7350138758 :         if (xa_is_value(folio))
    1888             :                 folio = NULL;
    1889  7309875486 :         if (!folio)
    1890  2818547231 :                 goto no_page;
    1891             : 
    1892  4531591527 :         if (fgp_flags & FGP_LOCK) {
    1893  1378852833 :                 if (fgp_flags & FGP_NOWAIT) {
    1894           0 :                         if (!folio_trylock(folio)) {
    1895           0 :                                 folio_put(folio);
    1896           0 :                                 return ERR_PTR(-EAGAIN);
    1897             :                         }
    1898             :                 } else {
    1899  1378852833 :                         folio_lock(folio);
    1900             :                 }
    1901             : 
    1902             :                 /* Has the page been truncated? */
    1903  1378926099 :                 if (unlikely(folio->mapping != mapping)) {
    1904           6 :                         folio_unlock(folio);
    1905           6 :                         folio_put(folio);
    1906           6 :                         goto repeat;
    1907             :                 }
    1908  4531664787 :                 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
    1909             :         }
    1910             : 
    1911  4531664787 :         if (fgp_flags & FGP_ACCESSED)
    1912   108593436 :                 folio_mark_accessed(folio);
    1913             :         else if (fgp_flags & FGP_WRITE) {
    1914             :                 /* Clear idle flag for buffer write */
    1915             :                 if (folio_test_idle(folio))
    1916             :                         folio_clear_idle(folio);
    1917             :         }
    1918             : 
    1919  4531680409 :         if (fgp_flags & FGP_STABLE)
    1920  1301058493 :                 folio_wait_stable(folio);
    1921  3230621916 : no_page:
    1922  7349754187 :         if (!folio && (fgp_flags & FGP_CREAT)) {
    1923   816973560 :                 int err;
    1924  1572529512 :                 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
    1925   755568049 :                         gfp |= __GFP_WRITE;
    1926   817054562 :                 if (fgp_flags & FGP_NOFS)
    1927   719266972 :                         gfp &= ~__GFP_FS;
    1928   817054562 :                 if (fgp_flags & FGP_NOWAIT) {
    1929        2349 :                         gfp &= ~GFP_KERNEL;
    1930        2349 :                         gfp |= GFP_NOWAIT | __GFP_NOWARN;
    1931             :                 }
    1932             : 
    1933   817054562 :                 folio = filemap_alloc_folio(gfp, 0);
    1934   818803789 :                 if (!folio)
    1935             :                         return ERR_PTR(-ENOMEM);
    1936             : 
    1937   818803789 :                 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
    1938           0 :                         fgp_flags |= FGP_LOCK;
    1939             : 
    1940             :                 /* Init accessed so avoid atomic mark_page_accessed later */
    1941   818803789 :                 if (fgp_flags & FGP_ACCESSED)
    1942    62304909 :                         __folio_set_referenced(folio);
    1943             : 
    1944   818801637 :                 err = filemap_add_folio(mapping, folio, index, gfp);
    1945   818016328 :                 if (unlikely(err)) {
    1946        2310 :                         folio_put(folio);
    1947        2309 :                         folio = NULL;
    1948        2309 :                         if (err == -EEXIST)
    1949        2309 :                                 goto repeat;
    1950             :                 }
    1951             : 
    1952             :                 /*
     1953             :                  * filemap_add_folio locks the folio, and for mmap
     1954             :                  * we expect an unlocked folio.
    1955             :                  */
    1956   818014018 :                 if (folio && (fgp_flags & FGP_FOR_MMAP))
    1957       22360 :                         folio_unlock(folio);
    1958             :         }
    1959             : 
    1960  7350794676 :         if (!folio)
    1961  2001584570 :                 return ERR_PTR(-ENOENT);
    1962             :         return folio;
    1963             : }
    1964             : EXPORT_SYMBOL(__filemap_get_folio);
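/*
 * Editorial sketch (not part of the kernel source): one plausible way a
 * filesystem could use __filemap_get_folio() above to look up, or create,
 * a locked folio for a buffered write.  example_get_write_folio() is a
 * hypothetical helper; the FGP_* flags and the ERR_PTR() return convention
 * are the ones implemented above.
 */
static struct folio *example_get_write_folio(struct address_space *mapping,
                                             pgoff_t index)
{
        struct folio *folio;

        /* Lock the folio, create it if absent, and wait for writeback to settle. */
        folio = __filemap_get_folio(mapping, index,
                        FGP_LOCK | FGP_CREAT | FGP_WRITE | FGP_STABLE,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return folio;           /* propagate the ERR_PTR() */

        /* ... modify the folio contents here ... */

        folio_unlock(folio);
        return folio;                   /* caller still holds the reference */
}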
    1965             : 
    1966  8270296813 : static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
    1967             :                 xa_mark_t mark)
    1968             : {
    1969  8270298688 :         struct folio *folio;
    1970             : 
    1971             : retry:
    1972  8270298688 :         if (mark == XA_PRESENT)
    1973  6186122851 :                 folio = xas_find(xas, max);
    1974             :         else
    1975  2084175837 :                 folio = xas_find_marked(xas, max, mark);
    1976             : 
    1977  8270294006 :         if (xas_retry(xas, folio))
    1978           0 :                 goto retry;
    1979             :         /*
    1980             :          * A shadow entry of a recently evicted page, a swap
    1981             :          * entry from shmem/tmpfs or a DAX entry.  Return it
    1982             :          * without attempting to raise page count.
    1983             :          */
    1984  8270294006 :         if (!folio || xa_is_value(folio))
    1985  3582297253 :                 return folio;
    1986             : 
    1987  4687996753 :         if (!folio_try_get_rcu(folio))
    1988        1844 :                 goto reset;
    1989             : 
    1990  4688391960 :         if (unlikely(folio != xas_reload(xas))) {
    1991          31 :                 folio_put(folio);
    1992          31 :                 goto reset;
    1993             :         }
    1994             : 
    1995             :         return folio;
    1996        1875 : reset:
    1997        1875 :         xas_reset(xas);
    1998        1875 :         goto retry;
    1999             : }
    2000             : 
    2001             : /**
    2002             :  * find_get_entries - gang pagecache lookup
    2003             :  * @mapping:    The address_space to search
    2004             :  * @start:      The starting page cache index
    2005             :  * @end:        The final page index (inclusive).
    2006             :  * @fbatch:     Where the resulting entries are placed.
     2007             :  * @indices:    The cache indices corresponding to the entries in @fbatch
    2008             :  *
    2009             :  * find_get_entries() will search for and return a batch of entries in
    2010             :  * the mapping.  The entries are placed in @fbatch.  find_get_entries()
    2011             :  * takes a reference on any actual folios it returns.
    2012             :  *
    2013             :  * The entries have ascending indexes.  The indices may not be consecutive
    2014             :  * due to not-present entries or large folios.
    2015             :  *
    2016             :  * Any shadow entries of evicted folios, or swap entries from
    2017             :  * shmem/tmpfs, are included in the returned array.
    2018             :  *
    2019             :  * Return: The number of entries which were found.
    2020             :  */
    2021  1044075022 : unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
    2022             :                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
    2023             : {
    2024  1044075022 :         XA_STATE(xas, &mapping->i_pages, *start);
    2025  1044075022 :         struct folio *folio;
    2026             : 
    2027  1044075022 :         rcu_read_lock();
    2028  1141403413 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
    2029   104648138 :                 indices[fbatch->nr] = xas.xa_index;
    2030   104648138 :                 if (!folio_batch_add(fbatch, folio))
    2031             :                         break;
    2032             :         }
    2033  1042180001 :         rcu_read_unlock();
    2034             : 
    2035  1042739822 :         if (folio_batch_count(fbatch)) {
    2036    13911685 :                 unsigned long nr = 1;
    2037    13911685 :                 int idx = folio_batch_count(fbatch) - 1;
    2038             : 
    2039    13911685 :                 folio = fbatch->folios[idx];
    2040    13911685 :                 if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
    2041    13941211 :                         nr = folio_nr_pages(folio);
    2042    13911546 :                 *start = indices[idx] + nr;
    2043             :         }
    2044  1042739683 :         return folio_batch_count(fbatch);
    2045             : }
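/*
 * Editorial sketch (not part of the kernel source): the usual calling
 * pattern for find_get_entries().  Value entries (shadow, swap or DAX) are
 * skipped here; real callers such as the truncate paths handle them
 * explicitly.  example_scan_entries() is a hypothetical helper.
 */
static void example_scan_entries(struct address_space *mapping,
                                 pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        unsigned int i;

        folio_batch_init(&fbatch);
        while (find_get_entries(mapping, &start, end, &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        if (xa_is_value(folio))
                                continue;       /* no reference held on value entries */
                        /* ... inspect the folio cached at indices[i] ... */
                        folio_put(folio);       /* drop the lookup reference */
                }
                folio_batch_init(&fbatch);
                cond_resched();
        }
}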
    2046             : 
    2047             : /**
    2048             :  * find_lock_entries - Find a batch of pagecache entries.
    2049             :  * @mapping:    The address_space to search.
    2050             :  * @start:      The starting page cache index.
    2051             :  * @end:        The final page index (inclusive).
    2052             :  * @fbatch:     Where the resulting entries are placed.
    2053             :  * @indices:    The cache indices of the entries in @fbatch.
    2054             :  *
    2055             :  * find_lock_entries() will return a batch of entries from @mapping.
    2056             :  * Swap, shadow and DAX entries are included.  Folios are returned
    2057             :  * locked and with an incremented refcount.  Folios which are locked
    2058             :  * by somebody else or under writeback are skipped.  Folios which are
    2059             :  * partially outside the range are not returned.
    2060             :  *
    2061             :  * The entries have ascending indexes.  The indices may not be consecutive
    2062             :  * due to not-present entries, large folios, folios which could not be
    2063             :  * locked or folios under writeback.
    2064             :  *
    2065             :  * Return: The number of entries which were found.
    2066             :  */
    2067  1339538308 : unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
    2068             :                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
    2069  3568546333 : {
    2070  1339538308 :         XA_STATE(xas, &mapping->i_pages, *start);
    2071  1339538308 :         struct folio *folio;
    2072             : 
    2073  1339538308 :         rcu_read_lock();
    2074  4934263364 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
    2075  3778264376 :                 if (!xa_is_value(folio)) {
    2076  3521235703 :                         if (folio->index < *start)
    2077     4737826 :                                 goto put;
    2078  3632536565 :                         if (folio->index + folio_nr_pages(folio) - 1 > end)
    2079     6846228 :                                 goto put;
    2080  3509651649 :                         if (!folio_trylock(folio))
    2081     7144318 :                                 goto put;
    2082  3502611076 :                         if (folio->mapping != mapping ||
    2083             :                             folio_test_writeback(folio))
    2084     9847823 :                                 goto unlock;
    2085  3749791926 :                         VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
    2086             :                                         folio);
    2087             :                 }
    2088  3749791926 :                 indices[fbatch->nr] = xas.xa_index;
    2089  3749791926 :                 if (!folio_batch_add(fbatch, folio))
    2090             :                         break;
    2091  3568546333 :                 continue;
    2092             : unlock:
    2093     9847823 :                 folio_unlock(folio);
    2094    28575402 : put:
    2095    28575402 :                 folio_put(folio);
    2096             :         }
    2097  1337958410 :         rcu_read_unlock();
    2098             : 
    2099  1337519920 :         if (folio_batch_count(fbatch)) {
    2100   446951815 :                 unsigned long nr = 1;
    2101   446951815 :                 int idx = folio_batch_count(fbatch) - 1;
    2102             : 
    2103   446951815 :                 folio = fbatch->folios[idx];
    2104   446951815 :                 if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
    2105   438815465 :                         nr = folio_nr_pages(folio);
    2106   446899929 :                 *start = indices[idx] + nr;
    2107             :         }
    2108  1337468034 :         return folio_batch_count(fbatch);
    2109             : }
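/*
 * Editorial sketch (not part of the kernel source): as above, but with
 * find_lock_entries(), which hands back folios locked and not under
 * writeback, so the caller must unlock as well as drop its reference.
 * example_scan_locked_entries() is a hypothetical helper.
 */
static void example_scan_locked_entries(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        unsigned int i;

        folio_batch_init(&fbatch);
        while (find_lock_entries(mapping, &start, end, &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        if (xa_is_value(folio))
                                continue;       /* neither locked nor referenced */
                        /* ... the folio is locked and fully inside the range ... */
                        folio_unlock(folio);
                        folio_put(folio);
                }
                folio_batch_init(&fbatch);
                cond_resched();
        }
}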
    2110             : 
    2111             : /**
    2112             :  * filemap_get_folios - Get a batch of folios
    2113             :  * @mapping:    The address_space to search
    2114             :  * @start:      The starting page index
    2115             :  * @end:        The final page index (inclusive)
    2116             :  * @fbatch:     The batch to fill.
    2117             :  *
    2118             :  * Search for and return a batch of folios in the mapping starting at
    2119             :  * index @start and up to index @end (inclusive).  The folios are returned
    2120             :  * in @fbatch with an elevated reference count.
    2121             :  *
    2122             :  * The first folio may start before @start; if it does, it will contain
    2123             :  * @start.  The final folio may extend beyond @end; if it does, it will
    2124             :  * contain @end.  The folios have ascending indices.  There may be gaps
    2125             :  * between the folios if there are indices which have no folio in the
    2126             :  * page cache.  If folios are added to or removed from the page cache
    2127             :  * while this is running, they may or may not be found by this call.
    2128             :  *
    2129             :  * Return: The number of folios which were found.
    2130             :  * We also update @start to index the next folio for the traversal.
    2131             :  */
    2132    33343206 : unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
    2133             :                 pgoff_t end, struct folio_batch *fbatch)
    2134             : {
    2135    33343206 :         XA_STATE(xas, &mapping->i_pages, *start);
    2136    33343206 :         struct folio *folio;
    2137             : 
    2138    33343206 :         rcu_read_lock();
    2139    64309208 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
    2140             :                 /* Skip over shadow, swap and DAX entries */
    2141    32865017 :                 if (xa_is_value(folio))
    2142           0 :                         continue;
    2143    32865017 :                 if (!folio_batch_add(fbatch, folio)) {
    2144     1807007 :                         unsigned long nr = folio_nr_pages(folio);
    2145             : 
    2146     1807007 :                         if (folio_test_hugetlb(folio))
    2147           0 :                                 nr = 1;
    2148     1807011 :                         *start = folio->index + nr;
    2149     1807011 :                         goto out;
    2150             :                 }
    2151             :         }
    2152             : 
    2153             :         /*
    2154             :          * We come here when there is no page beyond @end. We take care to not
    2155             :          * overflow the index @start as it confuses some of the callers. This
    2156             :          * breaks the iteration when there is a page at index -1 but that is
    2157             :          * already broken anyway.
    2158             :          */
    2159    31483774 :         if (end == (pgoff_t)-1)
    2160           0 :                 *start = (pgoff_t)-1;
    2161             :         else
    2162    31483774 :                 *start = end + 1;
    2163    33290785 : out:
    2164    33290785 :         rcu_read_unlock();
    2165             : 
    2166    33273657 :         return folio_batch_count(fbatch);
    2167             : }
    2168             : EXPORT_SYMBOL(filemap_get_folios);
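/*
 * Editorial sketch (not part of the kernel source): counting how many pages
 * are cached in a range with filemap_get_folios().  The first and last
 * folios may extend beyond the range, as the comment above notes.
 * example_count_cached_pages() is a hypothetical helper.
 */
static unsigned long example_count_cached_pages(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        unsigned long pages = 0;
        unsigned int i;

        folio_batch_init(&fbatch);
        while (filemap_get_folios(mapping, &start, end, &fbatch)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        pages += folio_nr_pages(fbatch.folios[i]);
                folio_batch_release(&fbatch);   /* drops the references and resets the batch */
                cond_resched();
        }
        return pages;
}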
    2169             : 
    2170             : static inline
    2171 34464183838 : bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
    2172             : {
    2173 34464183838 :         if (!folio_test_large(folio) || folio_test_hugetlb(folio))
    2174 34453569951 :                 return false;
    2175    10613954 :         if (index >= max)
    2176             :                 return false;
    2177    19132012 :         return index < folio->index + folio_nr_pages(folio) - 1;
    2178             : }
    2179             : 
    2180             : /**
    2181             :  * filemap_get_folios_contig - Get a batch of contiguous folios
    2182             :  * @mapping:    The address_space to search
    2183             :  * @start:      The starting page index
    2184             :  * @end:        The final page index (inclusive)
    2185             :  * @fbatch:     The batch to fill
    2186             :  *
    2187             :  * filemap_get_folios_contig() works exactly like filemap_get_folios(),
    2188             :  * except the returned folios are guaranteed to be contiguous. This may
    2189             :  * not return all contiguous folios if the batch gets filled up.
    2190             :  *
    2191             :  * Return: The number of folios found.
    2192             :  * Also update @start to be positioned for traversal of the next folio.
    2193             :  */
    2194             : 
    2195    11155523 : unsigned filemap_get_folios_contig(struct address_space *mapping,
    2196             :                 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
    2197   104387693 : {
    2198    11155523 :         XA_STATE(xas, &mapping->i_pages, *start);
    2199    11155523 :         unsigned long nr;
    2200    11155523 :         struct folio *folio;
    2201             : 
    2202    11155523 :         rcu_read_lock();
    2203             : 
    2204   115539222 :         for (folio = xas_load(&xas); folio && xas.xa_index <= end;
    2205   104387693 :                         folio = xas_next(&xas)) {
    2206   110920885 :                 if (xas_retry(&xas, folio))
    2207           0 :                         continue;
    2208             :                 /*
    2209             :                  * If the entry has been swapped out, we can stop looking.
    2210             :                  * No current caller is looking for DAX entries.
    2211             :                  */
    2212   110920885 :                 if (xa_is_value(folio))
    2213           0 :                         goto update_start;
    2214             : 
    2215   110920885 :                 if (!folio_try_get_rcu(folio))
    2216           0 :                         goto retry;
    2217             : 
    2218   110923733 :                 if (unlikely(folio != xas_reload(&xas)))
    2219           0 :                         goto put_folio;
    2220             : 
    2221   110924771 :                 if (!folio_batch_add(fbatch, folio)) {
    2222     6537078 :                         nr = folio_nr_pages(folio);
    2223             : 
    2224     6537078 :                         if (folio_test_hugetlb(folio))
    2225           0 :                                 nr = 1;
    2226     6537085 :                         *start = folio->index + nr;
    2227     6537085 :                         goto out;
    2228             :                 }
    2229   104387693 :                 continue;
    2230             : put_folio:
    2231           0 :                 folio_put(folio);
    2232             : 
    2233           0 : retry:
    2234           0 :                 xas_reset(&xas);
    2235             :         }
    2236             : 
    2237     4618415 : update_start:
    2238     4618415 :         nr = folio_batch_count(fbatch);
    2239             : 
    2240     4618415 :         if (nr) {
    2241     4618407 :                 folio = fbatch->folios[nr - 1];
    2242     4618407 :                 if (folio_test_hugetlb(folio))
    2243           0 :                         *start = folio->index + 1;
    2244             :                 else
    2245     4618369 :                         *start = folio->index + folio_nr_pages(folio);
    2246             :         }
    2247           8 : out:
    2248    11155462 :         rcu_read_unlock();
    2249    11155350 :         return folio_batch_count(fbatch);
    2250             : }
    2251             : EXPORT_SYMBOL(filemap_get_folios_contig);
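/*
 * Editorial sketch (not part of the kernel source): measuring how many bytes
 * are cached contiguously from @start using filemap_get_folios_contig().
 * example_contig_cached_bytes() is a hypothetical helper; a full batch is
 * taken as a hint that more contiguous folios may follow.
 */
static size_t example_contig_cached_bytes(struct address_space *mapping,
                                          pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        size_t bytes = 0;
        unsigned int i, nr;

        folio_batch_init(&fbatch);
        do {
                nr = filemap_get_folios_contig(mapping, &start, end, &fbatch);
                for (i = 0; i < nr; i++)
                        bytes += folio_size(fbatch.folios[i]);
                folio_batch_release(&fbatch);
        } while (nr == PAGEVEC_SIZE);
        return bytes;
}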
    2252             : 
    2253             : /**
    2254             :  * filemap_get_folios_tag - Get a batch of folios matching @tag
    2255             :  * @mapping:    The address_space to search
    2256             :  * @start:      The starting page index
    2257             :  * @end:        The final page index (inclusive)
    2258             :  * @tag:        The tag index
    2259             :  * @fbatch:     The batch to fill
    2260             :  *
    2261             :  * Same as filemap_get_folios(), but only returning folios tagged with @tag.
    2262             :  *
    2263             :  * Return: The number of folios found.
    2264             :  * Also update @start to index the next folio for traversal.
    2265             :  */
    2266  1144587010 : unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
    2267             :                         pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
    2268             : {
    2269  1144587010 :         XA_STATE(xas, &mapping->i_pages, *start);
    2270  1144587010 :         struct folio *folio;
    2271             : 
    2272  1144587010 :         rcu_read_lock();
    2273  2084043368 :         while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
    2274             :                 /*
    2275             :                  * Shadow entries should never be tagged, but this iteration
    2276             :                  * is lockless so there is a window for page reclaim to evict
    2277             :                  * a page we saw tagged. Skip over it.
    2278             :                  */
    2279   986738144 :                 if (xa_is_value(folio))
    2280           0 :                         continue;
    2281   986738144 :                 if (!folio_batch_add(fbatch, folio)) {
    2282    46647756 :                         unsigned long nr = folio_nr_pages(folio);
    2283             : 
    2284    46647756 :                         if (folio_test_hugetlb(folio))
    2285           0 :                                 nr = 1;
    2286    46647721 :                         *start = folio->index + nr;
    2287    46647721 :                         goto out;
    2288             :                 }
    2289             :         }
    2290             :         /*
    2291             :          * We come here when there is no page beyond @end. We take care to not
    2292             :          * overflow the index @start as it confuses some of the callers. This
    2293             :          * breaks the iteration when there is a page at index -1 but that is
     2294             :          * already broken anyway.
    2295             :          */
    2296  1097550646 :         if (end == (pgoff_t)-1)
    2297    47803900 :                 *start = (pgoff_t)-1;
    2298             :         else
    2299  1049746746 :                 *start = end + 1;
    2300  1144198367 : out:
    2301  1144198367 :         rcu_read_unlock();
    2302             : 
    2303  1143988762 :         return folio_batch_count(fbatch);
    2304             : }
    2305             : EXPORT_SYMBOL(filemap_get_folios_tag);
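/*
 * Editorial sketch (not part of the kernel source): a writeback-style walk
 * over dirty folios with filemap_get_folios_tag().  PAGECACHE_TAG_DIRTY is
 * the xarray mark set on dirty folios; example_write_dirty_range() is a
 * hypothetical helper and omits the re-checks and error handling a real
 * ->writepages() implementation needs.
 */
static void example_write_dirty_range(struct address_space *mapping,
                                      pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        unsigned int i;

        folio_batch_init(&fbatch);
        while (filemap_get_folios_tag(mapping, &start, end,
                                      PAGECACHE_TAG_DIRTY, &fbatch)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        folio_lock(folio);
                        /* ... re-check folio->mapping and dirtiness, then write it out ... */
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
}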
    2306             : 
    2307             : /*
    2308             :  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
    2309             :  * a _large_ part of the i/o request. Imagine the worst scenario:
    2310             :  *
    2311             :  *      ---R__________________________________________B__________
     2312             :  *         ^ reading here                             ^ bad block (assume 4k)
    2313             :  *
    2314             :  * read(R) => miss => readahead(R...B) => media error => frustrating retries
    2315             :  * => failing the whole request => read(R) => read(R+1) =>
    2316             :  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
    2317             :  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
    2318             :  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
    2319             :  *
    2320             :  * It is going insane. Fix it by quickly scaling down the readahead size.
    2321             :  */
    2322             : static void shrink_readahead_size_eio(struct file_ra_state *ra)
    2323             : {
    2324        8543 :         ra->ra_pages /= 4;
    2325        8543 : }
    2326             : 
    2327             : /*
    2328             :  * filemap_get_read_batch - Get a batch of folios for read
    2329             :  *
    2330             :  * Get a batch of folios which represent a contiguous range of bytes in
    2331             :  * the file.  No exceptional entries will be returned.  If @index is in
    2332             :  * the middle of a folio, the entire folio will be returned.  The last
    2333             :  * folio in the batch may have the readahead flag set or the uptodate flag
    2334             :  * clear so that the caller can take the appropriate action.
    2335             :  */
    2336  1045853163 : static void filemap_get_read_batch(struct address_space *mapping,
    2337             :                 pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
    2338  1581874173 : {
    2339  1045853163 :         XA_STATE(xas, &mapping->i_pages, index);
    2340  1045853163 :         struct folio *folio;
    2341             : 
    2342  1045853163 :         rcu_read_lock();
    2343  2626810171 :         for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
    2344  2520521404 :                 if (xas_retry(&xas, folio))
    2345           0 :                         continue;
    2346  2520521404 :                 if (xas.xa_index > max || xa_is_value(folio))
    2347             :                         break;
    2348  1622876152 :                 if (xa_is_sibling(folio))
    2349             :                         break;
    2350  1622876152 :                 if (!folio_try_get_rcu(folio))
    2351          71 :                         goto retry;
    2352             : 
    2353  1624345905 :                 if (unlikely(folio != xas_reload(&xas)))
    2354         124 :                         goto put_folio;
    2355             : 
    2356  1624337062 :                 if (!folio_batch_add(fbatch, folio))
    2357             :                         break;
    2358  3177626586 :                 if (!folio_test_uptodate(folio))
    2359             :                         break;
    2360  1582973911 :                 if (folio_test_readahead(folio))
    2361             :                         break;
    2362  1804543479 :                 xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
    2363  1581874173 :                 continue;
    2364             : put_folio:
    2365         124 :                 folio_put(folio);
    2366         195 : retry:
    2367         195 :                 xas_reset(&xas);
    2368             :         }
    2369  1048074408 :         rcu_read_unlock();
    2370  1047911268 : }
    2371             : 
    2372  1659339631 : static int filemap_read_folio(struct file *file, filler_t filler,
    2373             :                 struct folio *folio)
    2374             : {
    2375  1659339631 :         bool workingset = folio_test_workingset(folio);
    2376  1659339631 :         unsigned long pflags;
    2377  1659339631 :         int error;
    2378             : 
    2379             :         /*
    2380             :          * A previous I/O error may have been due to temporary failures,
     2381             :          * e.g. multipath errors.  PG_error will be set again if read_folio
    2382             :          * fails.
    2383             :          */
    2384  1659339631 :         folio_clear_error(folio);
    2385             : 
    2386             :         /* Start the actual read. The read will unlock the page. */
    2387  1660649223 :         if (unlikely(workingset))
    2388     1484036 :                 psi_memstall_enter(&pflags);
    2389  1660649224 :         error = filler(file, folio);
    2390  1660425411 :         if (unlikely(workingset))
    2391     1484036 :                 psi_memstall_leave(&pflags);
    2392  1660425411 :         if (error)
    2393             :                 return error;
    2394             : 
    2395  1660458179 :         error = folio_wait_locked_killable(folio);
    2396  1660393468 :         if (error)
    2397             :                 return error;
    2398  3320768559 :         if (folio_test_uptodate(folio))
    2399             :                 return 0;
    2400        8543 :         if (file)
    2401        8543 :                 shrink_readahead_size_eio(&file->f_ra);
    2402             :         return -EIO;
    2403             : }
    2404             : 
    2405       58794 : static bool filemap_range_uptodate(struct address_space *mapping,
    2406             :                 loff_t pos, size_t count, struct folio *folio,
    2407             :                 bool need_uptodate)
    2408             : {
    2409       60905 :         if (folio_test_uptodate(folio))
    2410             :                 return true;
    2411             :         /* pipes can't handle partially uptodate pages */
    2412       56683 :         if (need_uptodate)
    2413             :                 return false;
    2414       43682 :         if (!mapping->a_ops->is_partially_uptodate)
    2415             :                 return false;
    2416       17185 :         if (mapping->host->i_blkbits >= folio_shift(folio))
    2417             :                 return false;
    2418             : 
    2419          62 :         if (folio_pos(folio) > pos) {
    2420          27 :                 count -= folio_pos(folio) - pos;
    2421          27 :                 pos = 0;
    2422             :         } else {
    2423          35 :                 pos -= folio_pos(folio);
    2424             :         }
    2425             : 
    2426          62 :         return mapping->a_ops->is_partially_uptodate(folio, pos, count);
    2427             : }
    2428             : 
    2429    11719550 : static int filemap_update_page(struct kiocb *iocb,
    2430             :                 struct address_space *mapping, size_t count,
    2431             :                 struct folio *folio, bool need_uptodate)
    2432             : {
    2433    11719550 :         int error;
    2434             : 
    2435    11719550 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    2436           0 :                 if (!filemap_invalidate_trylock_shared(mapping))
    2437             :                         return -EAGAIN;
    2438             :         } else {
    2439    11719550 :                 filemap_invalidate_lock_shared(mapping);
    2440             :         }
    2441             : 
    2442    11719473 :         if (!folio_trylock(folio)) {
    2443    11660762 :                 error = -EAGAIN;
    2444    11660762 :                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
    2445           0 :                         goto unlock_mapping;
    2446    11660762 :                 if (!(iocb->ki_flags & IOCB_WAITQ)) {
    2447    11660762 :                         filemap_invalidate_unlock_shared(mapping);
    2448             :                         /*
    2449             :                          * This is where we usually end up waiting for a
    2450             :                          * previously submitted readahead to finish.
    2451             :                          */
    2452    11660723 :                         folio_put_wait_locked(folio, TASK_KILLABLE);
    2453    11660723 :                         return AOP_TRUNCATED_PAGE;
    2454             :                 }
    2455           0 :                 error = __folio_lock_async(folio, iocb->ki_waitq);
    2456           0 :                 if (error)
    2457           0 :                         goto unlock_mapping;
    2458             :         }
    2459             : 
    2460       58805 :         error = AOP_TRUNCATED_PAGE;
    2461       58805 :         if (!folio->mapping)
    2462          11 :                 goto unlock;
    2463             : 
    2464       58794 :         error = 0;
    2465       58794 :         if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
    2466             :                                    need_uptodate))
    2467        2112 :                 goto unlock;
    2468             : 
    2469       56682 :         error = -EAGAIN;
    2470       56682 :         if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
    2471           0 :                 goto unlock;
    2472             : 
    2473       56682 :         error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
    2474             :                         folio);
    2475       56682 :         goto unlock_mapping;
    2476        2123 : unlock:
    2477        2123 :         folio_unlock(folio);
    2478       58805 : unlock_mapping:
    2479       58805 :         filemap_invalidate_unlock_shared(mapping);
    2480       58805 :         if (error == AOP_TRUNCATED_PAGE)
    2481          11 :                 folio_put(folio);
    2482             :         return error;
    2483             : }
    2484             : 
    2485      208038 : static int filemap_create_folio(struct file *file,
    2486             :                 struct address_space *mapping, pgoff_t index,
    2487             :                 struct folio_batch *fbatch)
    2488             : {
    2489      208038 :         struct folio *folio;
    2490      208038 :         int error;
    2491             : 
    2492      208038 :         folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
    2493      208038 :         if (!folio)
    2494             :                 return -ENOMEM;
    2495             : 
    2496             :         /*
    2497             :          * Protect against truncate / hole punch. Grabbing invalidate_lock
    2498             :          * here assures we cannot instantiate and bring uptodate new
    2499             :          * pagecache folios after evicting page cache during truncate
    2500             :          * and before actually freeing blocks.  Note that we could
    2501             :          * release invalidate_lock after inserting the folio into
    2502             :          * the page cache as the locked folio would then be enough to
    2503             :          * synchronize with hole punching. But there are code paths
    2504             :          * such as filemap_update_page() filling in partially uptodate
    2505             :          * pages or ->readahead() that need to hold invalidate_lock
    2506             :          * while mapping blocks for IO so let's hold the lock here as
    2507             :          * well to keep locking rules simple.
    2508             :          */
    2509      208038 :         filemap_invalidate_lock_shared(mapping);
    2510      208038 :         error = filemap_add_folio(mapping, folio, index,
    2511             :                         mapping_gfp_constraint(mapping, GFP_KERNEL));
    2512      208038 :         if (error == -EEXIST)
    2513             :                 error = AOP_TRUNCATED_PAGE;
    2514      208035 :         if (error)
    2515           3 :                 goto error;
    2516             : 
    2517      208035 :         error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
    2518      208035 :         if (error)
    2519           3 :                 goto error;
    2520             : 
    2521      208032 :         filemap_invalidate_unlock_shared(mapping);
    2522      208032 :         folio_batch_add(fbatch, folio);
    2523      208032 :         return 0;
    2524           6 : error:
    2525           6 :         filemap_invalidate_unlock_shared(mapping);
    2526           6 :         folio_put(folio);
    2527           6 :         return error;
    2528             : }
    2529             : 
    2530     1327658 : static int filemap_readahead(struct kiocb *iocb, struct file *file,
    2531             :                 struct address_space *mapping, struct folio *folio,
    2532             :                 pgoff_t last_index)
    2533             : {
    2534     1327658 :         DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
    2535             : 
    2536     1327658 :         if (iocb->ki_flags & IOCB_NOIO)
    2537             :                 return -EAGAIN;
    2538     1327658 :         page_cache_async_ra(&ractl, folio, last_index - folio->index);
    2539     1327658 :         return 0;
    2540             : }
    2541             : 
    2542  1007227146 : static int filemap_get_pages(struct kiocb *iocb, size_t count,
    2543             :                 struct folio_batch *fbatch, bool need_uptodate)
    2544             : {
    2545  1007227146 :         struct file *filp = iocb->ki_filp;
    2546  1007227146 :         struct address_space *mapping = filp->f_mapping;
    2547  1007227146 :         struct file_ra_state *ra = &filp->f_ra;
    2548  1007227146 :         pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
    2549  1007227146 :         pgoff_t last_index;
    2550  1007227146 :         struct folio *folio;
    2551  1007227146 :         int err = 0;
    2552             : 
    2553             :         /* "last_index" is the index of the page beyond the end of the read */
    2554  1007227146 :         last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
    2555             : retry:
    2556  1017108783 :         if (fatal_signal_pending(current))
    2557             :                 return -EINTR;
    2558             : 
    2559  1015633136 :         filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
    2560  1018267360 :         if (!folio_batch_count(fbatch)) {
    2561    29482308 :                 if (iocb->ki_flags & IOCB_NOIO)
    2562             :                         return -EAGAIN;
    2563    29482308 :                 page_cache_sync_readahead(mapping, ra, filp, index,
    2564             :                                 last_index - index);
    2565    29484939 :                 filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
    2566             :         }
    2567  1018270223 :         if (!folio_batch_count(fbatch)) {
    2568      208038 :                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
    2569             :                         return -EAGAIN;
    2570      208038 :                 err = filemap_create_folio(filp, mapping,
    2571      208038 :                                 iocb->ki_pos >> PAGE_SHIFT, fbatch);
    2572      208038 :                 if (err == AOP_TRUNCATED_PAGE)
    2573           3 :                         goto retry;
    2574      208035 :                 return err;
    2575             :         }
    2576             : 
    2577  1018062185 :         folio = fbatch->folios[folio_batch_count(fbatch) - 1];
    2578  1018062185 :         if (folio_test_readahead(folio)) {
    2579     1327677 :                 err = filemap_readahead(iocb, filp, mapping, folio, last_index);
    2580     1327693 :                 if (err)
    2581           0 :                         goto err;
    2582             :         }
    2583  2024226332 :         if (!folio_test_uptodate(folio)) {
    2584    11719672 :                 if ((iocb->ki_flags & IOCB_WAITQ) &&
    2585             :                     folio_batch_count(fbatch) > 1)
    2586           0 :                         iocb->ki_flags |= IOCB_NOWAIT;
    2587    11719672 :                 err = filemap_update_page(iocb, mapping, count, folio,
    2588             :                                           need_uptodate);
    2589    11716871 :                 if (err)
    2590    11664348 :                         goto err;
    2591             :         }
    2592             : 
    2593             :         return 0;
    2594    11664348 : err:
    2595    11664348 :         if (err < 0)
    2596        6271 :                 folio_put(folio);
    2597    11664348 :         if (likely(--fbatch->nr))
    2598             :                 return 0;
    2599     9887783 :         if (err == AOP_TRUNCATED_PAGE)
    2600     9881634 :                 goto retry;
    2601             :         return err;
    2602             : }
    2603             : 
    2604             : static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
    2605             : {
    2606   988562158 :         unsigned int shift = folio_shift(folio);
    2607             : 
    2608   988562158 :         return (pos1 >> shift == pos2 >> shift);
    2609             : }
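/*
 * Editorial note (not part of the kernel source): with 4KiB base pages, a
 * 4-page folio gives folio_shift() == 14, so pos_same_folio(0x0, 0x3fff,
 * folio) is true while pos_same_folio(0x3fff, 0x4000, folio) is false,
 * because those positions fall in different folio-sized windows of the file.
 */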
    2610             : 
    2611             : /**
    2612             :  * filemap_read - Read data from the page cache.
    2613             :  * @iocb: The iocb to read.
    2614             :  * @iter: Destination for the data.
    2615             :  * @already_read: Number of bytes already read by the caller.
    2616             :  *
    2617             :  * Copies data from the page cache.  If the data is not currently present,
    2618             :  * uses the readahead and read_folio address_space operations to fetch it.
    2619             :  *
    2620             :  * Return: Total number of bytes copied, including those already read by
    2621             :  * the caller.  If an error happens before any bytes are copied, returns
    2622             :  * a negative error number.
    2623             :  */
    2624   980461766 : ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
    2625             :                 ssize_t already_read)
    2626             : {
    2627   980461766 :         struct file *filp = iocb->ki_filp;
    2628   980461766 :         struct file_ra_state *ra = &filp->f_ra;
    2629   980461766 :         struct address_space *mapping = filp->f_mapping;
    2630   980461766 :         struct inode *inode = mapping->host;
    2631   980461766 :         struct folio_batch fbatch;
    2632   980461766 :         int i, error = 0;
    2633   980461766 :         bool writably_mapped;
    2634   980461766 :         loff_t isize, end_offset;
    2635             : 
    2636   980461766 :         if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
    2637             :                 return 0;
    2638   980461766 :         if (unlikely(!iov_iter_count(iter)))
    2639             :                 return 0;
    2640             : 
    2641   980461766 :         iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    2642   980461766 :         folio_batch_init(&fbatch);
    2643             : 
    2644  1010425136 :         do {
    2645  1010425136 :                 cond_resched();
    2646             : 
    2647             :                 /*
    2648             :                  * If we've already successfully copied some data, then we
    2649             :                  * can no longer safely return -EIOCBQUEUED. Hence mark
    2650             :                  * an async read NOWAIT at that point.
    2651             :                  */
    2652  1009797334 :                 if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
    2653           0 :                         iocb->ki_flags |= IOCB_NOWAIT;
    2654             : 
    2655  1009797334 :                 if (unlikely(iocb->ki_pos >= i_size_read(inode)))
    2656             :                         break;
    2657             : 
    2658   988731988 :                 error = filemap_get_pages(iocb, iter->count, &fbatch, false);
    2659   988568299 :                 if (error < 0)
    2660             :                         break;
    2661             : 
    2662             :                 /*
    2663             :                  * i_size must be checked after we know the pages are Uptodate.
    2664             :                  *
    2665             :                  * Checking i_size after the check allows us to calculate
    2666             :                  * the correct value for "nr", which means the zero-filled
    2667             :                  * part of the page is not copied back to userspace (unless
    2668             :                  * another truncate extends the file - this is desired though).
    2669             :                  */
    2670   988562158 :                 isize = i_size_read(inode);
    2671   988562158 :                 if (unlikely(iocb->ki_pos >= isize))
    2672           0 :                         goto put_folios;
    2673   988562158 :                 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
    2674             : 
    2675             :                 /*
    2676             :                  * Once we start copying data, we don't want to be touching any
    2677             :                  * cachelines that might be contended:
    2678             :                  */
    2679   988562158 :                 writably_mapped = mapping_writably_mapped(mapping);
    2680             : 
    2681             :                 /*
    2682             :                  * When a read accesses the same folio several times, only
    2683             :                  * mark it as accessed the first time.
    2684             :                  */
    2685  1127351392 :                 if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
    2686             :                                                         fbatch.folios[0]))
    2687   466229641 :                         folio_mark_accessed(fbatch.folios[0]);
    2688             : 
    2689  2483665718 :                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
    2690  1494107419 :                         struct folio *folio = fbatch.folios[i];
    2691  1494107419 :                         size_t fsize = folio_size(folio);
    2692  1494107419 :                         size_t offset = iocb->ki_pos & (fsize - 1);
    2693  1494107419 :                         size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
    2694             :                                              fsize - offset);
    2695  1494107419 :                         size_t copied;
    2696             : 
    2697  1494107419 :                         if (end_offset < folio_pos(folio))
    2698             :                                 break;
    2699  1494107419 :                         if (i > 0)
    2700   505230665 :                                 folio_mark_accessed(folio);
    2701             :                         /*
    2702             :                          * If users can be writing to this folio using arbitrary
    2703             :                          * virtual addresses, take care of potential aliasing
    2704             :                          * before reading the folio on the kernel side.
    2705             :                          */
    2706  1494132225 :                         if (writably_mapped)
    2707             :                                 flush_dcache_folio(folio);
    2708             : 
    2709  1494132225 :                         copied = copy_folio_to_iter(folio, offset, bytes, iter);
    2710             : 
    2711  1495170443 :                         already_read += copied;
    2712  1495170443 :                         iocb->ki_pos += copied;
    2713  1495170443 :                         ra->prev_pos = iocb->ki_pos;
    2714             : 
    2715  1495170443 :                         if (copied < bytes) {
    2716             :                                 error = -EFAULT;
    2717             :                                 break;
    2718             :                         }
    2719             :                 }
    2720   989558299 : put_folios:
    2721  2484877287 :                 for (i = 0; i < folio_batch_count(&fbatch); i++)
    2722  1494821165 :                         folio_put(fbatch.folios[i]);
    2723   990056122 :                 folio_batch_init(&fbatch);
    2724   990056122 :         } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
    2725             : 
    2726   981164239 :         file_accessed(filp);
    2727             : 
    2728   978611687 :         return already_read ? already_read : error;
    2729             : }
    2730             : EXPORT_SYMBOL_GPL(filemap_read);
    2731             : 
    2732    46501224 : int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
    2733             : {
    2734    46501224 :         struct address_space *mapping = iocb->ki_filp->f_mapping;
    2735    46501224 :         loff_t pos = iocb->ki_pos;
    2736    46501224 :         loff_t end = pos + count - 1;
    2737             : 
    2738    46501224 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    2739           0 :                 if (filemap_range_needs_writeback(mapping, pos, end))
    2740             :                         return -EAGAIN;
    2741           0 :                 return 0;
    2742             :         }
    2743             : 
    2744    46501224 :         return filemap_write_and_wait_range(mapping, pos, end);
    2745             : }
    2746             : 
    2747    26698422 : int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
    2748             : {
    2749    26698422 :         struct address_space *mapping = iocb->ki_filp->f_mapping;
    2750    26698422 :         loff_t pos = iocb->ki_pos;
    2751    26698422 :         loff_t end = pos + count - 1;
    2752    26698422 :         int ret;
    2753             : 
    2754    26698422 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    2755             :                 /* we could block if there are any pages in the range */
    2756           5 :                 if (filemap_range_has_page(mapping, pos, end))
    2757             :                         return -EAGAIN;
    2758             :         } else {
    2759    26698417 :                 ret = filemap_write_and_wait_range(mapping, pos, end);
    2760    26697656 :                 if (ret)
    2761             :                         return ret;
    2762             :         }
    2763             : 
    2764             :         /*
    2765             :          * After a write we want buffered reads to be sure to go to disk to get
     2766             :          * the new data.  We invalidate clean cached pages from the region we're
    2767             :          * about to write.  We do this *before* the write so that we can return
    2768             :          * without clobbering -EIOCBQUEUED from ->direct_IO().
    2769             :          */
    2770    26697612 :         return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
    2771    26697612 :                                              end >> PAGE_SHIFT);
    2772             : }
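/*
 * Editorial sketch (not part of the kernel source): the typical prologue of
 * a direct-I/O write path, using kiocb_invalidate_pages() above to write
 * back and then invalidate any cached pages over the target range before
 * ->direct_IO() is issued.  example_dio_write_prologue() is a hypothetical
 * helper.
 */
static ssize_t example_dio_write_prologue(struct kiocb *iocb,
                                          struct iov_iter *from)
{
        int ret;

        ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
        if (ret)
                return ret;     /* -EAGAIN for IOCB_NOWAIT with cached pages, or a writeback error */

        /* ... now safe to start the direct write ... */
        return 0;
}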
    2773             : 
    2774             : /**
    2775             :  * generic_file_read_iter - generic filesystem read routine
    2776             :  * @iocb:       kernel I/O control block
    2777             :  * @iter:       destination for the data read
    2778             :  *
    2779             :  * This is the "read_iter()" routine for all filesystems
    2780             :  * that can use the page cache directly.
    2781             :  *
    2782             :  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
    2783             :  * be returned when no data can be read without waiting for I/O requests
    2784             :  * to complete; it doesn't prevent readahead.
    2785             :  *
    2786             :  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
    2787             :  * requests shall be made for the read or for readahead.  When no data
    2788             :  * can be read, -EAGAIN shall be returned.  When readahead would be
    2789             :  * triggered, a partial, possibly empty read shall be returned.
    2790             :  *
    2791             :  * Return:
    2792             :  * * number of bytes copied, even for partial reads
    2793             :  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
    2794             :  */
    2795             : ssize_t
    2796   846754237 : generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
    2797             : {
    2798   846754237 :         size_t count = iov_iter_count(iter);
    2799   846754237 :         ssize_t retval = 0;
    2800             : 
    2801   846754237 :         if (!count)
    2802             :                 return 0; /* skip atime */
    2803             : 
    2804   846753810 :         if (iocb->ki_flags & IOCB_DIRECT) {
    2805           0 :                 struct file *file = iocb->ki_filp;
    2806           0 :                 struct address_space *mapping = file->f_mapping;
    2807           0 :                 struct inode *inode = mapping->host;
    2808             : 
    2809           0 :                 retval = kiocb_write_and_wait(iocb, count);
    2810           0 :                 if (retval < 0)
    2811             :                         return retval;
    2812           0 :                 file_accessed(file);
    2813             : 
    2814           0 :                 retval = mapping->a_ops->direct_IO(iocb, iter);
    2815           0 :                 if (retval >= 0) {
    2816           0 :                         iocb->ki_pos += retval;
    2817           0 :                         count -= retval;
    2818             :                 }
    2819           0 :                 if (retval != -EIOCBQUEUED)
    2820           0 :                         iov_iter_revert(iter, count - iov_iter_count(iter));
    2821             : 
    2822             :                 /*
    2823             :                  * Btrfs can have a short DIO read if we encounter
    2824             :                  * compressed extents, so if there was an error, or if
    2825             :                  * we've already read everything we wanted to, or if
    2826             :                  * there was a short read because we hit EOF, go ahead
    2827             :                  * and return.  Otherwise fallthrough to buffered io for
    2828             :                  * the rest of the read.  Buffered reads will not work for
    2829             :                  * DAX files, so don't bother trying.
    2830             :                  */
    2831           0 :                 if (retval < 0 || !count || IS_DAX(inode))
    2832             :                         return retval;
    2833           0 :                 if (iocb->ki_pos >= i_size_read(inode))
    2834             :                         return retval;
    2835             :         }
    2836             : 
    2837   846753810 :         return filemap_read(iocb, iter, retval);
    2838             : }
    2839             : EXPORT_SYMBOL(generic_file_read_iter);
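/*
 * Editorial sketch (not part of the kernel source): a pagecache-backed
 * filesystem can wire the generic helpers straight into its
 * file_operations; simple in-memory filesystems such as ramfs use
 * essentially this set.  example_file_operations is a hypothetical
 * instance.
 */
static const struct file_operations example_file_operations = {
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        .mmap           = generic_file_mmap,
        .splice_read    = filemap_splice_read,
        .splice_write   = iter_file_splice_write,
        .fsync          = generic_file_fsync,
};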
    2840             : 
    2841             : /*
    2842             :  * Splice subpages from a folio into a pipe.
    2843             :  */
    2844   113871602 : size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
    2845             :                               struct folio *folio, loff_t fpos, size_t size)
    2846             : {
    2847   113871602 :         struct page *page;
    2848   113871602 :         size_t spliced = 0, offset = offset_in_folio(folio, fpos);
    2849             : 
    2850   113871602 :         page = folio_page(folio, offset / PAGE_SIZE);
    2851   113871602 :         size = min(size, folio_size(folio) - offset);
    2852   113871602 :         offset %= PAGE_SIZE;
    2853             : 
    2854   248340908 :         while (spliced < size &&
    2855   134836341 :                !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
    2856   134469192 :                 struct pipe_buffer *buf = pipe_head_buf(pipe);
    2857   134469192 :                 size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
    2858             : 
    2859   134469192 :                 *buf = (struct pipe_buffer) {
    2860             :                         .ops    = &page_cache_pipe_buf_ops,
    2861             :                         .page   = page,
    2862             :                         .offset = offset,
    2863             :                         .len    = part,
    2864             :                 };
    2865   134469192 :                 folio_get(folio);
    2866   134469306 :                 pipe->head++;
    2867   134469306 :                 page++;
    2868   134469306 :                 spliced += part;
    2869   134469306 :                 offset = 0;
    2870             :         }
    2871             : 
    2872   113871716 :         return spliced;
    2873             : }
    2874             : 
    2875             : /**
    2876             :  * filemap_splice_read -  Splice data from a file's pagecache into a pipe
    2877             :  * @in: The file to read from
    2878             :  * @ppos: Pointer to the file position to read from
    2879             :  * @pipe: The pipe to splice into
    2880             :  * @len: The amount to splice
    2881             :  * @flags: The SPLICE_F_* flags
    2882             :  *
    2883             :  * This function gets folios from a file's pagecache and splices them into the
    2884             :  * pipe.  Readahead will be called as necessary to fill more folios.  This may
    2885             :  * be used for blockdevs also.
    2886             :  *
    2887             :  * Return: On success, the number of bytes read will be returned and *@ppos
    2888             :  * will be updated if appropriate; 0 will be returned if there is no more data
    2889             :  * to be read; -EAGAIN will be returned if the pipe had no space, and some
    2890             :  * other negative error code will be returned on error.  A short read may occur
    2891             :  * if the pipe has insufficient space, we reach the end of the data or we hit a
    2892             :  * hole.
    2893             :  */
    2894    12343346 : ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
    2895             :                             struct pipe_inode_info *pipe,
    2896             :                             size_t len, unsigned int flags)
    2897             : {
    2898    12343346 :         struct folio_batch fbatch;
    2899    12343346 :         struct kiocb iocb;
    2900    12343346 :         size_t total_spliced = 0, used, npages;
    2901    12343346 :         loff_t isize, end_offset;
    2902    12343346 :         bool writably_mapped;
    2903    12343346 :         int i, error = 0;
    2904             : 
    2905    12343346 :         if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
    2906             :                 return 0;
    2907             : 
    2908    12343346 :         init_sync_kiocb(&iocb, in);
    2909    12343439 :         iocb.ki_pos = *ppos;
    2910             : 
    2911             :         /* Work out how much data we can actually add into the pipe */
    2912    12343439 :         used = pipe_occupancy(pipe->head, pipe->tail);
    2913    12343439 :         npages = max_t(ssize_t, pipe->max_usage - used, 0);
    2914    12343439 :         len = min_t(size_t, len, npages * PAGE_SIZE);
    2915             : 
    2916    12343439 :         folio_batch_init(&fbatch);
    2917             : 
    2918    19293972 :         do {
    2919    19293972 :                 cond_resched();
    2920             : 
    2921    19294029 :                 if (*ppos >= i_size_read(in->f_mapping->host))
    2922             :                         break;
    2923             : 
    2924    19294029 :                 iocb.ki_pos = *ppos;
    2925    19294029 :                 error = filemap_get_pages(&iocb, len, &fbatch, true);
    2926    19294111 :                 if (error < 0)
    2927             :                         break;
    2928             : 
    2929             :                 /*
    2930             :                  * i_size must be checked after we know the pages are Uptodate.
    2931             :                  *
     2932             :                  * Checking i_size after the uptodate check lets us calculate
     2933             :                  * the correct amount to splice, so the zero-filled part of
     2934             :                  * the folio beyond EOF is not passed to the pipe (unless
     2935             :                  * another truncate extends the file - this is desired though).
    2936             :                  */
    2937    19293739 :                 isize = i_size_read(in->f_mapping->host);
    2938    19293739 :                 if (unlikely(*ppos >= isize))
    2939             :                         break;
    2940    19293739 :                 end_offset = min_t(loff_t, isize, *ppos + len);
    2941             : 
    2942             :                 /*
    2943             :                  * Once we start copying data, we don't want to be touching any
    2944             :                  * cachelines that might be contended:
    2945             :                  */
    2946    19293739 :                 writably_mapped = mapping_writably_mapped(in->f_mapping);
    2947             : 
    2948   128615510 :                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
    2949   113871361 :                         struct folio *folio = fbatch.folios[i];
    2950   113871361 :                         size_t n;
    2951             : 
    2952   113871361 :                         if (folio_pos(folio) >= end_offset)
    2953           0 :                                 goto out;
    2954   113871361 :                         folio_mark_accessed(folio);
    2955             : 
    2956             :                         /*
    2957             :                          * If users can be writing to this folio using arbitrary
    2958             :                          * virtual addresses, take care of potential aliasing
    2959             :                          * before reading the folio on the kernel side.
    2960             :                          */
    2961   113871801 :                         if (writably_mapped)
    2962             :                                 flush_dcache_folio(folio);
    2963             : 
    2964   113871801 :                         n = min_t(loff_t, len, isize - *ppos);
    2965   113871801 :                         n = splice_folio_into_pipe(pipe, folio, *ppos, n);
    2966   113871444 :                         if (!n)
    2967           0 :                                 goto out;
    2968   113871444 :                         len -= n;
    2969   113871444 :                         total_spliced += n;
    2970   113871444 :                         *ppos += n;
    2971   113871444 :                         in->f_ra.prev_pos = *ppos;
    2972   113871444 :                         if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
    2973     4549673 :                                 goto out;
    2974             :                 }
    2975             : 
    2976    14744149 :                 folio_batch_release(&fbatch);
    2977    14744142 :         } while (len);
    2978             : 
    2979     7793981 : out:
    2980    12343654 :         folio_batch_release(&fbatch);
    2981    12343652 :         file_accessed(in);
    2982             : 
    2983    12343334 :         return total_spliced ? total_spliced : error;
    2984             : }
    2985             : EXPORT_SYMBOL(filemap_splice_read);
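/*
 * Illustrative sketch, not part of filemap.c: a filesystem whose data lives
 * in the page cache can typically point its ->splice_read at
 * filemap_splice_read() directly in its file_operations.  The
 * example_file_operations name below is hypothetical.
 */
static const struct file_operations example_file_operations = {
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .mmap           = generic_file_mmap,
        .splice_read    = filemap_splice_read,
};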
    2986             : 
    2987    48235588 : static inline loff_t folio_seek_hole_data(struct xa_state *xas,
    2988             :                 struct address_space *mapping, struct folio *folio,
    2989             :                 loff_t start, loff_t end, bool seek_data)
    2990             : {
    2991    48235588 :         const struct address_space_operations *ops = mapping->a_ops;
    2992    48235588 :         size_t offset, bsz = i_blocksize(mapping->host);
    2993             : 
    2994    96468262 :         if (xa_is_value(folio) || folio_test_uptodate(folio))
    2995    48057409 :                 return seek_data ? start : end;
    2996      178253 :         if (!ops->is_partially_uptodate)
    2997      151176 :                 return seek_data ? end : start;
    2998             : 
    2999       27077 :         xas_pause(xas);
    3000       27077 :         rcu_read_unlock();
    3001       27077 :         folio_lock(folio);
    3002       27077 :         if (unlikely(folio->mapping != mapping))
    3003           0 :                 goto unlock;
    3004             : 
    3005       27078 :         offset = offset_in_folio(folio, start) & ~(bsz - 1);
    3006             : 
    3007       27080 :         do {
    3008       27080 :                 if (ops->is_partially_uptodate(folio, offset, bsz) ==
    3009             :                                                         seek_data)
    3010             :                         break;
    3011       27080 :                 start = (start + bsz) & ~(bsz - 1);
    3012       27080 :                 offset += bsz;
    3013       27084 :         } while (offset < folio_size(folio));
    3014       27077 : unlock:
    3015       27077 :         folio_unlock(folio);
    3016       27077 :         rcu_read_lock();
    3017       27077 :         return start;
    3018             : }
    3019             : 
    3020    48235655 : static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
    3021             : {
    3022    48235655 :         if (xa_is_value(folio))
    3023        2987 :                 return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
    3024    48238401 :         return folio_size(folio);
    3025             : }
    3026             : 
    3027             : /**
    3028             :  * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
    3029             :  * @mapping: Address space to search.
    3030             :  * @start: First byte to consider.
    3031             :  * @end: Limit of search (exclusive).
    3032             :  * @whence: Either SEEK_HOLE or SEEK_DATA.
    3033             :  *
    3034             :  * If the page cache knows which blocks contain holes and which blocks
    3035             :  * contain data, your filesystem can use this function to implement
    3036             :  * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
    3037             :  * entirely memory-based such as tmpfs, and filesystems which support
    3038             :  * unwritten extents.
    3039             :  *
    3040             :  * Return: The requested offset on success, or -ENXIO if @whence specifies
    3041             :  * SEEK_DATA and there is no data after @start.  There is an implicit hole
    3042             :  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
    3043             :  * and @end contain data.
    3044             :  */
    3045    47543328 : loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
    3046             :                 loff_t end, int whence)
    3047             : {
    3048    47543328 :         XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
    3049    47543328 :         pgoff_t max = (end - 1) >> PAGE_SHIFT;
    3050    47543328 :         bool seek_data = (whence == SEEK_DATA);
    3051    47543328 :         struct folio *folio;
    3052             : 
    3053    47543328 :         if (end <= start)
    3054             :                 return -ENXIO;
    3055             : 
    3056    47543328 :         rcu_read_lock();
    3057    48375806 :         while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
    3058    48331358 :                 loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
    3059    48331358 :                 size_t seek_size;
    3060             : 
    3061    48331358 :                 if (start < pos) {
    3062    36640106 :                         if (!seek_data)
    3063       95707 :                                 goto unlock;
    3064             :                         start = pos;
    3065             :                 }
    3066             : 
    3067    48235651 :                 seek_size = seek_folio_size(&xas, folio);
    3068    48235508 :                 pos = round_up((u64)pos + 1, seek_size);
    3069    48235508 :                 start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
    3070             :                                 seek_data);
    3071    48235711 :                 if (start < pos)
    3072    47382275 :                         goto unlock;
    3073      853436 :                 if (start >= end)
    3074             :                         break;
    3075      832647 :                 if (seek_size > PAGE_SIZE)
    3076         653 :                         xas_set(&xas, pos >> PAGE_SHIFT);
    3077      832647 :                 if (!xa_is_value(folio))
    3078      832422 :                         folio_put(folio);
    3079             :         }
    3080       65023 :         if (seek_data)
    3081       59982 :                 start = -ENXIO;
    3082        5041 : unlock:
    3083    47543005 :         rcu_read_unlock();
    3084    47542859 :         if (folio && !xa_is_value(folio))
    3085    47495971 :                 folio_put(folio);
    3086    47543454 :         if (start > end)
    3087             :                 return end;
    3088             :         return start;
    3089             : }
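/*
 * Illustrative sketch, not part of filemap.c: a memory-backed filesystem can
 * implement SEEK_HOLE/SEEK_DATA in its ->llseek by delegating to
 * mapping_seek_hole_data(), roughly as below.  example_llseek is a
 * hypothetical name; the pattern follows what shmem does.
 */
static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;

        if (whence != SEEK_DATA && whence != SEEK_HOLE)
                return generic_file_llseek(file, offset, whence);
        if (offset < 0)
                return -ENXIO;

        inode_lock(inode);
        /* Holding i_rwsem, so i_size is stable here. */
        offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
        if (offset >= 0)
                offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
        inode_unlock(inode);
        return offset;
}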
    3090             : 
    3091             : #ifdef CONFIG_MMU
    3092             : #define MMAP_LOTSAMISS  (100)
    3093             : /*
     3094             :  * lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
    3095             :  * @vmf - the vm_fault for this fault.
    3096             :  * @folio - the folio to lock.
    3097             :  * @fpin - the pointer to the file we may pin (or is already pinned).
    3098             :  *
     3099             :  * This works similarly to lock_folio_or_retry in that it can drop the
     3100             :  * mmap_lock.  It differs in that it returns 1 with the folio locked, or
     3101             :  * 0 if it couldn't lock the folio.  If we did have
    3102             :  * to drop the mmap_lock then fpin will point to the pinned file and
    3103             :  * needs to be fput()'ed at a later point.
    3104             :  */
    3105   476477035 : static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
    3106             :                                      struct file **fpin)
    3107             : {
    3108   476477035 :         if (folio_trylock(folio))
    3109             :                 return 1;
    3110             : 
    3111             :         /*
    3112             :          * NOTE! This will make us return with VM_FAULT_RETRY, but with
    3113             :          * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
    3114             :          * is supposed to work. We have way too many special cases..
    3115             :          */
    3116     2640058 :         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
    3117             :                 return 0;
    3118             : 
    3119     2640060 :         *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
    3120     2639921 :         if (vmf->flags & FAULT_FLAG_KILLABLE) {
    3121     2639921 :                 if (__folio_lock_killable(folio)) {
    3122             :                         /*
    3123             :                          * We didn't have the right flags to drop the mmap_lock,
    3124             :                          * but all fault_handlers only check for fatal signals
    3125             :                          * if we return VM_FAULT_RETRY, so we need to drop the
    3126             :                          * mmap_lock here and return 0 if we don't have a fpin.
    3127             :                          */
    3128         179 :                         if (*fpin == NULL)
    3129           0 :                                 mmap_read_unlock(vmf->vma->vm_mm);
    3130         179 :                         return 0;
    3131             :                 }
    3132             :         } else
    3133           0 :                 __folio_lock(folio);
    3134             : 
    3135             :         return 1;
    3136             : }
    3137             : 
    3138             : /*
     3139             :  * Synchronous readahead happens when we don't find a page in the page
     3140             :  * cache at all.  We don't want to perform IO under the mmap_lock, so if we
     3141             :  * have to drop it we return the file that was pinned in order to do
     3142             :  * that.  If we didn't pin a file then we return NULL.  The file that is
     3143             :  * returned needs to be fput()'ed when we're done with it.
    3144             :  */
    3145     4991791 : static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
    3146             : {
    3147     4991791 :         struct file *file = vmf->vma->vm_file;
    3148     4991791 :         struct file_ra_state *ra = &file->f_ra;
    3149     4991791 :         struct address_space *mapping = file->f_mapping;
    3150     4991791 :         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
    3151     4991791 :         struct file *fpin = NULL;
    3152     4991791 :         unsigned long vm_flags = vmf->vma->vm_flags;
    3153     4991791 :         unsigned int mmap_miss;
    3154             : 
    3155             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    3156             :         /* Use the readahead code, even if readahead is disabled */
    3157     4991791 :         if (vm_flags & VM_HUGEPAGE) {
    3158           0 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3159           0 :                 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
    3160           0 :                 ra->size = HPAGE_PMD_NR;
    3161             :                 /*
    3162             :                  * Fetch two PMD folios, so we get the chance to actually
    3163             :                  * readahead, unless we've been told not to.
    3164             :                  */
    3165           0 :                 if (!(vm_flags & VM_RAND_READ))
    3166           0 :                         ra->size *= 2;
    3167           0 :                 ra->async_size = HPAGE_PMD_NR;
    3168           0 :                 page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
    3169           0 :                 return fpin;
    3170             :         }
    3171             : #endif
    3172             : 
    3173             :         /* If we don't want any read-ahead, don't bother */
    3174     4991791 :         if (vm_flags & VM_RAND_READ)
    3175             :                 return fpin;
    3176     4973398 :         if (!ra->ra_pages)
    3177             :                 return fpin;
    3178             : 
    3179     4973398 :         if (vm_flags & VM_SEQ_READ) {
    3180           0 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3181           0 :                 page_cache_sync_ra(&ractl, ra->ra_pages);
    3182           0 :                 return fpin;
    3183             :         }
    3184             : 
    3185             :         /* Avoid banging the cache line if not needed */
    3186     4973398 :         mmap_miss = READ_ONCE(ra->mmap_miss);
    3187     4973398 :         if (mmap_miss < MMAP_LOTSAMISS * 10)
    3188     4973438 :                 WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
    3189             : 
    3190             :         /*
    3191             :          * Do we miss much more than hit in this file? If so,
    3192             :          * stop bothering with read-ahead. It will only hurt.
    3193             :          */
    3194     4973398 :         if (mmap_miss > MMAP_LOTSAMISS)
    3195             :                 return fpin;
    3196             : 
    3197             :         /*
    3198             :          * mmap read-around
    3199             :          */
    3200     4973398 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3201     4973550 :         ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
    3202     4973550 :         ra->size = ra->ra_pages;
    3203     4973550 :         ra->async_size = ra->ra_pages / 4;
    3204     4973550 :         ractl._index = ra->start;
    3205     4973550 :         page_cache_ra_order(&ractl, ra, 0);
    3206     4973550 :         return fpin;
    3207             : }
    3208             : 
    3209             : /*
     3210             :  * Asynchronous readahead happens when we find the folio with PG_readahead
     3211             :  * set, so we may want to extend the readahead further.  We return the file that
    3212             :  * was pinned if we have to drop the mmap_lock in order to do IO.
    3213             :  */
    3214   466427457 : static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
    3215             :                                             struct folio *folio)
    3216             : {
    3217   466427457 :         struct file *file = vmf->vma->vm_file;
    3218   466427457 :         struct file_ra_state *ra = &file->f_ra;
    3219   466427457 :         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
    3220   466427457 :         struct file *fpin = NULL;
    3221   466427457 :         unsigned int mmap_miss;
    3222             : 
    3223             :         /* If we don't want any read-ahead, don't bother */
    3224   466427457 :         if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
    3225             :                 return fpin;
    3226             : 
    3227   466400210 :         mmap_miss = READ_ONCE(ra->mmap_miss);
    3228   466400210 :         if (mmap_miss)
    3229     3605797 :                 WRITE_ONCE(ra->mmap_miss, --mmap_miss);
    3230             : 
    3231   466400210 :         if (folio_test_readahead(folio)) {
    3232      367385 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3233      367300 :                 page_cache_async_ra(&ractl, folio, ra->ra_pages);
    3234             :         }
    3235             :         return fpin;
    3236             : }
    3237             : 
    3238             : /**
    3239             :  * filemap_fault - read in file data for page fault handling
    3240             :  * @vmf:        struct vm_fault containing details of the fault
    3241             :  *
    3242             :  * filemap_fault() is invoked via the vma operations vector for a
    3243             :  * mapped memory region to read in file data during a page fault.
    3244             :  *
    3245             :  * The goto's are kind of ugly, but this streamlines the normal case of having
    3246             :  * it in the page cache, and handles the special cases reasonably without
    3247             :  * having a lot of duplicated code.
    3248             :  *
    3249             :  * vma->vm_mm->mmap_lock must be held on entry.
    3250             :  *
    3251             :  * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
    3252             :  * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
    3253             :  *
    3254             :  * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
    3255             :  * has not been released.
    3256             :  *
    3257             :  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
    3258             :  *
    3259             :  * Return: bitwise-OR of %VM_FAULT_ codes.
    3260             :  */
    3261   476411598 : vm_fault_t filemap_fault(struct vm_fault *vmf)
    3262             : {
    3263   476411598 :         int error;
    3264   476411598 :         struct file *file = vmf->vma->vm_file;
    3265   476411598 :         struct file *fpin = NULL;
    3266   476411598 :         struct address_space *mapping = file->f_mapping;
    3267   476411598 :         struct inode *inode = mapping->host;
    3268   476411598 :         pgoff_t max_idx, index = vmf->pgoff;
    3269   476411598 :         struct folio *folio;
    3270   476411598 :         vm_fault_t ret = 0;
    3271   476411598 :         bool mapping_locked = false;
    3272             : 
    3273   476411598 :         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    3274   476411598 :         if (unlikely(index >= max_idx))
    3275             :                 return VM_FAULT_SIGBUS;
    3276             : 
    3277             :         /*
    3278             :          * Do we have something in the page cache already?
    3279             :          */
    3280   476408394 :         folio = filemap_get_folio(mapping, index);
    3281   476742460 :         if (likely(!IS_ERR(folio))) {
    3282             :                 /*
    3283             :                  * We found the page, so try async readahead before waiting for
    3284             :                  * the lock.
    3285             :                  */
    3286   471750992 :                 if (!(vmf->flags & FAULT_FLAG_TRIED))
    3287   466582141 :                         fpin = do_async_mmap_readahead(vmf, folio);
    3288   942349474 :                 if (unlikely(!folio_test_uptodate(folio))) {
    3289      582688 :                         filemap_invalidate_lock_shared(mapping);
    3290      582688 :                         mapping_locked = true;
    3291             :                 }
    3292             :         } else {
    3293             :                 /* No page in the page cache at all */
    3294     4991468 :                 count_vm_event(PGMAJFAULT);
    3295     4991704 :                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
    3296     4991954 :                 ret = VM_FAULT_MAJOR;
    3297     4991954 :                 fpin = do_sync_mmap_readahead(vmf);
    3298     4995745 : retry_find:
    3299             :                 /*
     3300             :                  * See comment in filemap_create_folio() for why we need
    3301             :                  * invalidate_lock
    3302             :                  */
    3303     4995745 :                 if (!mapping_locked) {
    3304     4995206 :                         filemap_invalidate_lock_shared(mapping);
    3305     4995206 :                         mapping_locked = true;
    3306             :                 }
    3307     4995760 :                 folio = __filemap_get_folio(mapping, index,
    3308             :                                           FGP_CREAT|FGP_FOR_MMAP,
    3309     4995760 :                                           vmf->gfp_mask);
    3310     4995787 :                 if (IS_ERR(folio)) {
    3311           0 :                         if (fpin)
    3312           0 :                                 goto out_retry;
    3313           0 :                         filemap_invalidate_unlock_shared(mapping);
    3314           0 :                         return VM_FAULT_OOM;
    3315             :                 }
    3316             :         }
    3317             : 
    3318   476459334 :         if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
    3319         179 :                 goto out_retry;
    3320             : 
    3321             :         /* Did it get truncated? */
    3322   476726525 :         if (unlikely(folio->mapping != mapping)) {
    3323        3094 :                 folio_unlock(folio);
    3324        3094 :                 folio_put(folio);
    3325        3094 :                 goto retry_find;
    3326             :         }
    3327   476723431 :         VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
    3328             : 
    3329             :         /*
     3330             :          * We have a locked folio in the page cache; now we need to check
     3331             :          * that it's up-to-date. If it isn't, that is due to an error.
    3332             :          */
    3333   953334450 :         if (unlikely(!folio_test_uptodate(folio))) {
    3334             :                 /*
    3335             :                  * The page was in cache and uptodate and now it is not.
    3336             :                  * Strange but possible since we didn't hold the page lock all
     3337             :                  * the time. Let's drop everything, get the invalidate lock and
    3338             :                  * try again.
    3339             :                  */
    3340       64605 :                 if (!mapping_locked) {
    3341           0 :                         folio_unlock(folio);
    3342           0 :                         folio_put(folio);
    3343           0 :                         goto retry_find;
    3344             :                 }
    3345       64605 :                 goto page_not_uptodate;
    3346             :         }
    3347             : 
    3348             :         /*
    3349             :          * We've made it this far and we had to drop our mmap_lock, now is the
    3350             :          * time to return to the upper layer and have it re-find the vma and
    3351             :          * redo the fault.
    3352             :          */
    3353   476629158 :         if (fpin) {
    3354     7158244 :                 folio_unlock(folio);
    3355     7158285 :                 goto out_retry;
    3356             :         }
    3357   469470914 :         if (mapping_locked)
    3358         526 :                 filemap_invalidate_unlock_shared(mapping);
    3359             : 
    3360             :         /*
    3361             :          * Found the page and have a reference on it.
    3362             :          * We must recheck i_size under page lock.
    3363             :          */
    3364   469470914 :         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    3365   469470914 :         if (unlikely(index >= max_idx)) {
    3366           1 :                 folio_unlock(folio);
    3367           1 :                 folio_put(folio);
    3368           1 :                 return VM_FAULT_SIGBUS;
    3369             :         }
    3370             : 
    3371   469470913 :         vmf->page = folio_file_page(folio, index);
    3372   469393454 :         return ret | VM_FAULT_LOCKED;
    3373             : 
    3374             : page_not_uptodate:
    3375             :         /*
    3376             :          * Umm, take care of errors if the page isn't up-to-date.
    3377             :          * Try to re-read it _once_. We do this synchronously,
    3378             :          * because there really aren't any performance issues here
    3379             :          * and we need to check for errors.
    3380             :          */
    3381       64605 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3382       64586 :         error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
    3383       64603 :         if (fpin)
    3384       63984 :                 goto out_retry;
    3385         619 :         folio_put(folio);
    3386             : 
    3387         619 :         if (!error || error == AOP_TRUNCATED_PAGE)
    3388         515 :                 goto retry_find;
    3389         104 :         filemap_invalidate_unlock_shared(mapping);
    3390             : 
    3391         104 :         return VM_FAULT_SIGBUS;
    3392             : 
    3393     7222448 : out_retry:
    3394             :         /*
     3395             :          * We dropped the mmap_lock, so we need to return to the fault handler
     3396             :          * to re-find the vma and come back to find our hopefully still
     3397             :          * populated page.
    3398             :          */
    3399     7222448 :         if (!IS_ERR(folio))
    3400     7222448 :                 folio_put(folio);
    3401     7222442 :         if (mapping_locked)
    3402     5577356 :                 filemap_invalidate_unlock_shared(mapping);
    3403     7222434 :         if (fpin)
    3404     7222434 :                 fput(fpin);
    3405     7222429 :         return ret | VM_FAULT_RETRY;
    3406             : }
    3407             : EXPORT_SYMBOL(filemap_fault);
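/*
 * Illustrative sketch, not part of filemap.c: filesystems that only need a
 * custom ->page_mkwrite (e.g. to reserve blocks before a write fault) can
 * still reuse filemap_fault() and filemap_map_pages() for the read side of
 * their vm_operations_struct.  The example_* names are hypothetical.
 */
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
        /* filesystem-specific preparation would go here... */
        return filemap_page_mkwrite(vmf);
}

static const struct vm_operations_struct example_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = example_page_mkwrite,
};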
    3408             : 
    3409  2646355327 : static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
    3410             :                 pgoff_t start)
    3411             : {
    3412  2646355327 :         struct mm_struct *mm = vmf->vma->vm_mm;
    3413             : 
    3414             :         /* Huge page is mapped? No need to proceed. */
    3415  2646355327 :         if (pmd_trans_huge(*vmf->pmd)) {
    3416           0 :                 folio_unlock(folio);
    3417           0 :                 folio_put(folio);
    3418           0 :                 return true;
    3419             :         }
    3420             : 
    3421  2646407556 :         if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
    3422       36383 :                 struct page *page = folio_file_page(folio, start);
    3423       36382 :                 vm_fault_t ret = do_set_pmd(vmf, page);
    3424       36389 :                 if (!ret) {
    3425             :                         /* The page is mapped successfully, reference consumed. */
    3426       36389 :                         folio_unlock(folio);
    3427       36389 :                         return true;
    3428             :                 }
    3429             :         }
    3430             : 
    3431  2646318944 :         if (pmd_none(*vmf->pmd))
    3432    82246205 :                 pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
    3433             : 
    3434             :         return false;
    3435             : }
    3436             : 
    3437 37321407500 : static struct folio *next_uptodate_page(struct folio *folio,
    3438             :                                        struct address_space *mapping,
    3439             :                                        struct xa_state *xas, pgoff_t end_pgoff)
    3440             : {
    3441 40525548786 :         unsigned long max_idx;
    3442             : 
    3443 40525548786 :         do {
    3444 40525548786 :                 if (!folio)
    3445             :                         return NULL;
    3446 37911466641 :                 if (xas_retry(xas, folio))
    3447           0 :                         continue;
    3448 37911466641 :                 if (xa_is_value(folio))
    3449   297373891 :                         continue;
    3450 37614092750 :                 if (folio_test_locked(folio))
    3451     9722594 :                         continue;
    3452 37604370156 :                 if (!folio_try_get_rcu(folio))
    3453           0 :                         continue;
    3454             :                 /* Has the page moved or been split? */
    3455 37648606747 :                 if (unlikely(folio != xas_reload(xas)))
    3456           0 :                         goto skip;
    3457 75283602281 :                 if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
    3458  3168228688 :                         goto skip;
    3459 34475436661 :                 if (!folio_trylock(folio))
    3460     7396108 :                         goto skip;
    3461 34474520880 :                 if (folio->mapping != mapping)
    3462           0 :                         goto unlock;
    3463 68943386359 :                 if (!folio_test_uptodate(folio))
    3464           0 :                         goto unlock;
    3465 34469006755 :                 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
    3466 34469006755 :                 if (xas->xa_index >= max_idx)
    3467           0 :                         goto unlock;
    3468             :                 return folio;
    3469           0 : unlock:
    3470           0 :                 folio_unlock(folio);
    3471  3175624796 : skip:
    3472  3175624796 :                 folio_put(folio);
    3473  3483265155 :         } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
    3474             : 
    3475             :         return NULL;
    3476             : }
    3477             : 
    3478  2879705049 : static inline struct folio *first_map_page(struct address_space *mapping,
    3479             :                                           struct xa_state *xas,
    3480             :                                           pgoff_t end_pgoff)
    3481             : {
    3482  2879705049 :         return next_uptodate_page(xas_find(xas, end_pgoff),
    3483             :                                   mapping, xas, end_pgoff);
    3484             : }
    3485             : 
    3486 34457810046 : static inline struct folio *next_map_page(struct address_space *mapping,
    3487             :                                          struct xa_state *xas,
    3488             :                                          pgoff_t end_pgoff)
    3489             : {
    3490 34457810046 :         return next_uptodate_page(xas_next_entry(xas, end_pgoff),
    3491             :                                   mapping, xas, end_pgoff);
    3492             : }
    3493             : 
    3494  2880130442 : vm_fault_t filemap_map_pages(struct vm_fault *vmf,
    3495             :                              pgoff_t start_pgoff, pgoff_t end_pgoff)
    3496 32457765203 : {
    3497  2880130442 :         struct vm_area_struct *vma = vmf->vma;
    3498  2880130442 :         struct file *file = vma->vm_file;
    3499  2880130442 :         struct address_space *mapping = file->f_mapping;
    3500  2880130442 :         pgoff_t last_pgoff = start_pgoff;
    3501  2880130442 :         unsigned long addr;
    3502  2880130442 :         XA_STATE(xas, &mapping->i_pages, start_pgoff);
    3503  2880130442 :         struct folio *folio;
    3504  2880130442 :         struct page *page;
    3505  2880130442 :         unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
    3506  2880130442 :         vm_fault_t ret = 0;
    3507             : 
    3508  2880130442 :         rcu_read_lock();
    3509  2879640651 :         folio = first_map_page(mapping, &xas, end_pgoff);
    3510  2880106499 :         if (!folio)
    3511   233614232 :                 goto out;
    3512             : 
    3513  2646492267 :         if (filemap_map_pmd(vmf, folio, start_pgoff)) {
    3514       36389 :                 ret = VM_FAULT_NOPAGE;
    3515       36389 :                 goto out;
    3516             :         }
    3517             : 
    3518  2646264001 :         addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
    3519  2646264001 :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
    3520  2646745740 :         if (!vmf->pte) {
    3521           0 :                 folio_unlock(folio);
    3522           0 :                 folio_put(folio);
    3523           0 :                 goto out;
    3524             :         }
    3525 34469557911 :         do {
    3526 34462091918 : again:
    3527 34469557911 :                 page = folio_file_page(folio, xas.xa_index);
    3528 34461295427 :                 if (PageHWPoison(page))
    3529           0 :                         goto unlock;
    3530             : 
    3531 34461295427 :                 if (mmap_miss > 0)
    3532     1258919 :                         mmap_miss--;
    3533             : 
    3534 34461295427 :                 addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
    3535 34461295427 :                 vmf->pte += xas.xa_index - last_pgoff;
    3536 34461295427 :                 last_pgoff = xas.xa_index;
    3537             : 
    3538             :                 /*
     3539             :                  * NOTE: If there are PTE markers, we'll leave them to be
     3540             :                  * handled in the specific fault path; this prohibits the
     3541             :                  * fault-around logic.
    3542             :                  */
    3543 34461295427 :                 if (!pte_none(ptep_get(vmf->pte)))
    3544  2009043104 :                         goto unlock;
    3545             : 
    3546             :                 /* We're about to handle the fault */
    3547 32452252323 :                 if (vmf->address == addr)
    3548  2644761102 :                         ret = VM_FAULT_NOPAGE;
    3549             : 
    3550 32452252323 :                 do_set_pte(vmf, page, addr);
    3551             :                 /* no need to invalidate: a not-present page won't be cached */
    3552 32470763008 :                 update_mmu_cache(vma, addr, vmf->pte);
    3553 32470763008 :                 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
    3554     6346842 :                         xas.xa_index++;
    3555     6346842 :                         folio_ref_inc(folio);
    3556     6347534 :                         goto again;
    3557             :                 }
    3558 32450055113 :                 folio_unlock(folio);
    3559 32457765203 :                 continue;
    3560  2009043104 : unlock:
    3561  2009043104 :                 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
    3562     1118459 :                         xas.xa_index++;
    3563     1118459 :                         goto again;
    3564             :                 }
    3565  2007890557 :                 folio_unlock(folio);
    3566  2008015605 :                 folio_put(folio);
    3567 34465738024 :         } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
    3568  2646284273 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3569  2880356412 : out:
    3570  2880356412 :         rcu_read_unlock();
    3571  2880266399 :         WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
    3572  2880266399 :         return ret;
    3573             : }
    3574             : EXPORT_SYMBOL(filemap_map_pages);
    3575             : 
    3576        2233 : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    3577             : {
    3578        2233 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    3579        2233 :         struct folio *folio = page_folio(vmf->page);
    3580        2233 :         vm_fault_t ret = VM_FAULT_LOCKED;
    3581             : 
    3582        2233 :         sb_start_pagefault(mapping->host->i_sb);
    3583        2233 :         file_update_time(vmf->vma->vm_file);
    3584        2233 :         folio_lock(folio);
    3585        2233 :         if (folio->mapping != mapping) {
    3586           0 :                 folio_unlock(folio);
    3587           0 :                 ret = VM_FAULT_NOPAGE;
    3588           0 :                 goto out;
    3589             :         }
    3590             :         /*
    3591             :          * We mark the folio dirty already here so that when freeze is in
    3592             :          * progress, we are guaranteed that writeback during freezing will
    3593             :          * see the dirty folio and writeprotect it again.
    3594             :          */
    3595        2233 :         folio_mark_dirty(folio);
    3596        2233 :         folio_wait_stable(folio);
    3597        2233 : out:
    3598        2233 :         sb_end_pagefault(mapping->host->i_sb);
    3599        2233 :         return ret;
    3600             : }
    3601             : 
    3602             : const struct vm_operations_struct generic_file_vm_ops = {
    3603             :         .fault          = filemap_fault,
    3604             :         .map_pages      = filemap_map_pages,
    3605             :         .page_mkwrite   = filemap_page_mkwrite,
    3606             : };
    3607             : 
    3608             : /* This is used for a general mmap of a disk file */
    3609             : 
    3610   736607833 : int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
    3611             : {
    3612   736607833 :         struct address_space *mapping = file->f_mapping;
    3613             : 
    3614   736607833 :         if (!mapping->a_ops->read_folio)
    3615             :                 return -ENOEXEC;
    3616   736607833 :         file_accessed(file);
    3617   736565179 :         vma->vm_ops = &generic_file_vm_ops;
    3618   736565179 :         return 0;
    3619             : }
    3620             : 
    3621             : /*
    3622             :  * This is for filesystems which do not implement ->writepage.
    3623             :  */
    3624           0 : int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
    3625             : {
    3626           0 :         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    3627             :                 return -EINVAL;
    3628           0 :         return generic_file_mmap(file, vma);
    3629             : }
    3630             : #else
    3631             : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    3632             : {
    3633             :         return VM_FAULT_SIGBUS;
    3634             : }
    3635             : int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
    3636             : {
    3637             :         return -ENOSYS;
    3638             : }
    3639             : int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
    3640             : {
    3641             :         return -ENOSYS;
    3642             : }
    3643             : #endif /* CONFIG_MMU */
    3644             : 
    3645             : EXPORT_SYMBOL(filemap_page_mkwrite);
    3646             : EXPORT_SYMBOL(generic_file_mmap);
    3647             : EXPORT_SYMBOL(generic_file_readonly_mmap);
    3648             : 
    3649  3872244681 : static struct folio *do_read_cache_folio(struct address_space *mapping,
    3650             :                 pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
    3651             : {
    3652  3872244681 :         struct folio *folio;
    3653  3872244681 :         int err;
    3654             : 
    3655  3872244681 :         if (!filler)
    3656  3859714118 :                 filler = mapping->a_ops->read_folio;
    3657  3872244681 : repeat:
    3658  3872265565 :         folio = filemap_get_folio(mapping, index);
    3659  3873473320 :         if (IS_ERR(folio)) {
    3660  1659476088 :                 folio = filemap_alloc_folio(gfp, 0);
    3661  1659209935 :                 if (!folio)
    3662             :                         return ERR_PTR(-ENOMEM);
    3663  1659209935 :                 err = filemap_add_folio(mapping, folio, index, gfp);
    3664  1659167184 :                 if (unlikely(err)) {
    3665          83 :                         folio_put(folio);
    3666          82 :                         if (err == -EEXIST)
    3667          82 :                                 goto repeat;
    3668             :                         /* Presumably ENOMEM for xarray node */
    3669           0 :                         return ERR_PTR(err);
    3670             :                 }
    3671             : 
    3672  1659167101 :                 goto filler;
    3673             :         }
    3674  4427801325 :         if (folio_test_uptodate(folio))
    3675  2213886023 :                 goto out;
    3676             : 
    3677       21775 :         if (!folio_trylock(folio)) {
    3678       20805 :                 folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
    3679       20802 :                 goto repeat;
    3680             :         }
    3681             : 
    3682             :         /* Folio was truncated from mapping */
    3683         970 :         if (!folio->mapping) {
    3684           0 :                 folio_unlock(folio);
    3685           0 :                 folio_put(folio);
    3686           0 :                 goto repeat;
    3687             :         }
    3688             : 
    3689             :         /* Someone else locked and filled the page in a very small window */
    3690         970 :         if (folio_test_uptodate(folio)) {
    3691           0 :                 folio_unlock(folio);
    3692           0 :                 goto out;
    3693             :         }
    3694             : 
    3695         970 : filler:
    3696  1659168071 :         err = filemap_read_folio(file, filler, folio);
    3697  1660022847 :         if (err) {
    3698        2509 :                 folio_put(folio);
    3699        2509 :                 if (err == AOP_TRUNCATED_PAGE)
    3700           0 :                         goto repeat;
    3701        2509 :                 return ERR_PTR(err);
    3702             :         }
    3703             : 
    3704  1660020338 : out:
    3705  3873906361 :         folio_mark_accessed(folio);
    3706  3873906361 :         return folio;
    3707             : }
    3708             : 
    3709             : /**
    3710             :  * read_cache_folio - Read into page cache, fill it if needed.
    3711             :  * @mapping: The address_space to read from.
    3712             :  * @index: The index to read.
    3713             :  * @filler: Function to perform the read, or NULL to use aops->read_folio().
    3714             :  * @file: Passed to filler function, may be NULL if not required.
    3715             :  *
    3716             :  * Read one page into the page cache.  If it succeeds, the folio returned
    3717             :  * will contain @index, but it may not be the first page of the folio.
    3718             :  *
    3719             :  * If the filler function returns an error, it will be returned to the
    3720             :  * caller.
    3721             :  *
    3722             :  * Context: May sleep.  Expects mapping->invalidate_lock to be held.
    3723             :  * Return: An uptodate folio on success, ERR_PTR() on failure.
    3724             :  */
    3725  3859713282 : struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
    3726             :                 filler_t filler, struct file *file)
    3727             : {
    3728  3859713282 :         return do_read_cache_folio(mapping, index, filler, file,
    3729             :                         mapping_gfp_mask(mapping));
    3730             : }
    3731             : EXPORT_SYMBOL(read_cache_folio);
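/*
 * Illustrative sketch, not part of filemap.c: a typical caller reads a folio
 * through the read_mapping_folio() wrapper (filler == NULL, so ->read_folio
 * is used), maps the byte it wants, and drops its reference.
 * example_read_byte is a hypothetical helper; the caller is assumed to hold
 * mapping->invalidate_lock as documented above.
 */
static int example_read_byte(struct address_space *mapping, loff_t pos, u8 *val)
{
        struct folio *folio;
        u8 *kaddr;

        folio = read_mapping_folio(mapping, pos >> PAGE_SHIFT, NULL);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        kaddr = kmap_local_folio(folio, offset_in_folio(folio, pos));
        *val = *kaddr;
        kunmap_local(kaddr);
        folio_put(folio);
        return 0;
}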
    3732             : 
    3733             : /**
    3734             :  * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
    3735             :  * @mapping:    The address_space for the folio.
    3736             :  * @index:      The index that the allocated folio will contain.
    3737             :  * @gfp:        The page allocator flags to use if allocating.
    3738             :  *
    3739             :  * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
    3740             :  * any new memory allocations done using the specified allocation flags.
    3741             :  *
    3742             :  * The most likely error from this function is EIO, but ENOMEM is
    3743             :  * possible and so is EINTR.  If ->read_folio returns another error,
    3744             :  * that will be returned to the caller.
    3745             :  *
    3746             :  * The function expects mapping->invalidate_lock to be already held.
    3747             :  *
    3748             :  * Return: Uptodate folio on success, ERR_PTR() on failure.
    3749             :  */
    3750           0 : struct folio *mapping_read_folio_gfp(struct address_space *mapping,
    3751             :                 pgoff_t index, gfp_t gfp)
    3752             : {
    3753           0 :         return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
    3754             : }
    3755             : EXPORT_SYMBOL(mapping_read_folio_gfp);
    3756             : 
    3757    12575409 : static struct page *do_read_cache_page(struct address_space *mapping,
    3758             :                 pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
    3759             : {
    3760    12575409 :         struct folio *folio;
    3761             : 
    3762    12575409 :         folio = do_read_cache_folio(mapping, index, filler, file, gfp);
    3763    12575879 :         if (IS_ERR(folio))
    3764          53 :                 return &folio->page;
    3765    12575826 :         return folio_file_page(folio, index);
    3766             : }
    3767             : 
    3768    12562411 : struct page *read_cache_page(struct address_space *mapping,
    3769             :                         pgoff_t index, filler_t *filler, struct file *file)
    3770             : {
    3771    12562411 :         return do_read_cache_page(mapping, index, filler, file,
    3772             :                         mapping_gfp_mask(mapping));
    3773             : }
    3774             : EXPORT_SYMBOL(read_cache_page);
    3775             : 
    3776             : /**
    3777             :  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
    3778             :  * @mapping:    the page's address_space
    3779             :  * @index:      the page index
    3780             :  * @gfp:        the page allocator flags to use if allocating
    3781             :  *
    3782             :  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
    3783             :  * any new page allocations done using the specified allocation flags.
    3784             :  *
    3785             :  * If the page does not get brought uptodate, return -EIO.
    3786             :  *
    3787             :  * The function expects mapping->invalidate_lock to be already held.
    3788             :  *
     3789             :  * Return: Uptodate page on success, ERR_PTR() on failure.
    3790             :  */
    3791       13229 : struct page *read_cache_page_gfp(struct address_space *mapping,
    3792             :                                 pgoff_t index,
    3793             :                                 gfp_t gfp)
    3794             : {
    3795       13229 :         return do_read_cache_page(mapping, index, NULL, NULL, gfp);
    3796             : }
    3797             : EXPORT_SYMBOL(read_cache_page_gfp);
    3798             : 
    3799             : /*
    3800             :  * Warn about a page cache invalidation failure during a direct I/O write.
    3801             :  */
    3802        4237 : static void dio_warn_stale_pagecache(struct file *filp)
    3803             : {
    3804        4237 :         static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
    3805        4237 :         char pathname[128];
    3806        4237 :         char *path;
    3807             : 
    3808        4237 :         errseq_set(&filp->f_mapping->wb_err, -EIO);
    3809        4237 :         if (__ratelimit(&_rs)) {
    3810         130 :                 path = file_path(filp, pathname, sizeof(pathname));
    3811         130 :                 if (IS_ERR(path))
    3812           0 :                         path = "(unknown)";
    3813         130 :                 pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
    3814         130 :                 pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
    3815             :                         current->comm);
    3816             :         }
    3817        4237 : }
    3818             : 
    3819    23763357 : void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
    3820             : {
    3821    23763357 :         struct address_space *mapping = iocb->ki_filp->f_mapping;
    3822             : 
    3823    33147982 :         if (mapping->nrpages &&
    3824     9384829 :             invalidate_inode_pages2_range(mapping,
    3825     9384829 :                         iocb->ki_pos >> PAGE_SHIFT,
    3826     9384829 :                         (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
    3827        4237 :                 dio_warn_stale_pagecache(iocb->ki_filp);
    3828    23763153 : }
    3829             : 
    3830             : ssize_t
    3831     5308776 : generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
    3832             : {
    3833     5308776 :         struct address_space *mapping = iocb->ki_filp->f_mapping;
    3834     5308776 :         size_t write_len = iov_iter_count(from);
    3835     5308776 :         ssize_t written;
    3836             : 
    3837             :         /*
     3838             :          * If a page cannot be invalidated, return 0 to fall back
    3839             :          * to buffered write.
    3840             :          */
    3841     5308776 :         written = kiocb_invalidate_pages(iocb, write_len);
    3842     5308776 :         if (written) {
    3843           0 :                 if (written == -EBUSY)
    3844             :                         return 0;
    3845           0 :                 return written;
    3846             :         }
    3847             : 
    3848     5308776 :         written = mapping->a_ops->direct_IO(iocb, from);
    3849             : 
    3850             :         /*
    3851             :          * Finally, try again to invalidate clean pages which might have been
    3852             :          * cached by non-direct readahead, or faulted in by get_user_pages()
    3853             :          * if the source of the write was an mmap'ed region of the file
    3854             :          * we're writing.  Either one is a pretty crazy thing to do,
    3855             :          * so we don't support it 100%.  If this invalidation
    3856             :          * fails, tough, the write still worked...
    3857             :          *
    3858             :          * Most of the time we do not need this since dio_complete() will do
    3859             :          * the invalidation for us. However there are some file systems that
    3860             :          * do not end up with dio_complete() being called, so let's not break
    3861             :          * them by removing it completely.
    3862             :          *
     3863             :          * A noticeable example is blkdev_direct_IO().
    3864             :          *
    3865             :          * Skip invalidation for async writes or if mapping has no pages.
    3866             :          */
    3867     5308776 :         if (written > 0) {
    3868     5300371 :                 struct inode *inode = mapping->host;
    3869     5300371 :                 loff_t pos = iocb->ki_pos;
    3870             : 
    3871     5300371 :                 kiocb_invalidate_post_direct_write(iocb, written);
    3872     5300371 :                 pos += written;
    3873     5300371 :                 write_len -= written;
    3874     5300371 :                 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
    3875           0 :                         i_size_write(inode, pos);
    3876           0 :                         mark_inode_dirty(inode);
    3877             :                 }
    3878     5300371 :                 iocb->ki_pos = pos;
    3879             :         }
    3880     5308776 :         if (written != -EIOCBQUEUED)
    3881     5300382 :                 iov_iter_revert(from, write_len - iov_iter_count(from));
    3882             :         return written;
    3883             : }
    3884             : EXPORT_SYMBOL(generic_file_direct_write);
    3885             : 
    3886   284654356 : ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
    3887             : {
    3888   284654356 :         struct file *file = iocb->ki_filp;
    3889   284654356 :         loff_t pos = iocb->ki_pos;
    3890   284654356 :         struct address_space *mapping = file->f_mapping;
    3891   284654356 :         const struct address_space_operations *a_ops = mapping->a_ops;
    3892   284654356 :         long status = 0;
    3893   284654356 :         ssize_t written = 0;
    3894             : 
    3895   392539795 :         do {
    3896   392539795 :                 struct page *page;
    3897   392539795 :                 unsigned long offset;   /* Offset into pagecache page */
    3898   392539795 :                 unsigned long bytes;    /* Bytes to write to page */
    3899   392539795 :                 size_t copied;          /* Bytes copied from user */
    3900   392539795 :                 void *fsdata = NULL;
    3901             : 
    3902   392539795 :                 offset = (pos & (PAGE_SIZE - 1));
    3903   392539795 :                 bytes = min_t(unsigned long, PAGE_SIZE - offset,
    3904             :                                                 iov_iter_count(i));
    3905             : 
    3906   392539795 : again:
    3907             :                 /*
    3908             :                  * Bring in the user page that we will copy from _first_.
    3909             :                  * Otherwise there's a nasty deadlock on copying from the
    3910             :                  * same page as we're writing to, without it being marked
    3911             :                  * up-to-date.
    3912             :                  */
    3913   392539795 :                 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
    3914             :                         status = -EFAULT;
    3915       59790 :                         break;
    3916             :                 }
    3917             : 
    3918   392545050 :                 if (fatal_signal_pending(current)) {
    3919             :                         status = -EINTR;
    3920             :                         break;
    3921             :                 }
    3922             : 
    3923   392419060 :                 status = a_ops->write_begin(file, mapping, pos, bytes,
    3924             :                                                 &page, &fsdata);
    3925   392484516 :                 if (unlikely(status < 0))
    3926             :                         break;
    3927             : 
    3928   392426603 :                 if (mapping_writably_mapped(mapping))
    3929             :                         flush_dcache_page(page);
    3930             : 
    3931   392426603 :                 copied = copy_page_from_iter_atomic(page, offset, bytes, i);
    3932   392505343 :                 flush_dcache_page(page);
    3933             : 
    3934   392505343 :                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
    3935             :                                                 page, fsdata);
    3936   392543934 :                 if (unlikely(status != copied)) {
    3937           0 :                         iov_iter_revert(i, copied - max(status, 0L));
    3938           0 :                         if (unlikely(status < 0))
    3939             :                                 break;
    3940             :                 }
    3941   392543934 :                 cond_resched();
    3942             : 
    3943   392538055 :                 if (unlikely(status == 0)) {
    3944             :                         /*
    3945             :                          * A short copy made ->write_end() reject the
    3946             :                          * thing entirely.  Might be memory poisoning
    3947             :                          * halfway through, might be a race with munmap,
    3948             :                          * might be severe memory pressure.
    3949             :                          */
    3950           0 :                         if (copied)
    3951           0 :                                 bytes = copied;
    3952           0 :                         goto again;
    3953             :                 }
    3954   392538055 :                 pos += status;
    3955   392538055 :                 written += status;
    3956             : 
    3957   392538055 :                 balance_dirty_pages_ratelimited(mapping);
    3958   392509333 :         } while (iov_iter_count(i));
    3959             : 
    3960   284683683 :         if (!written)
    3961             :                 return status;
    3962   284633117 :         iocb->ki_pos += written;
    3963   284633117 :         return written;
    3964             : }
    3965             : EXPORT_SYMBOL(generic_perform_write);
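As an illustrative aside (not part of filemap.c and carrying no coverage data): generic_perform_write() only works against a mapping whose ->write_begin()/->write_end() honour the contract driven by the loop above -- write_begin hands back a locked, usable pagecache page, and write_end commits the copied bytes and reports how many it accepted. A minimal sketch, assuming the libfs helpers and a made-up "examplefs" name (modelled on simple in-memory filesystems such as ramfs):

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical aops wiring; the helpers themselves are real libfs/mm code. */
static const struct address_space_operations examplefs_aops = {
	.read_folio	= simple_read_folio,	/* zero-fill and mark up to date */
	.write_begin	= simple_write_begin,	/* lock and prepare the pagecache folio */
	.write_end	= simple_write_end,	/* dirty the folio, advance i_size */
	.dirty_folio	= noop_dirty_folio,	/* no writeback for an in-memory fs */
};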
    3966             : 
    3967             : /**
    3968             :  * __generic_file_write_iter - write data to a file
    3969             :  * @iocb:       IO state structure (file, offset, etc.)
    3970             :  * @from:       iov_iter with data to write
    3971             :  *
    3972             :  * This function does all the work needed for actually writing data to a
    3973             :  * file. It does all basic checks, removes SUID from the file, updates
    3974             :  * modification times and calls proper subroutines depending on whether we
    3975             :  * do direct IO or a standard buffered write.
    3976             :  *
    3977             :  * It expects i_rwsem to be grabbed unless we work on a block device or similar
    3978             :  * object which does not need locking at all.
    3979             :  *
    3980             :  * This function does *not* take care of syncing data in case of O_SYNC write.
     3981             :  * A caller has to handle it, mainly because we want to avoid syncing
     3982             :  * under i_rwsem.
    3983             :  *
    3984             :  * Return:
    3985             :  * * number of bytes written, even for truncated writes
    3986             :  * * negative error code if no data has been written at all
    3987             :  */
    3988   274060322 : ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    3989             : {
    3990   274060322 :         struct file *file = iocb->ki_filp;
    3991   274060322 :         struct address_space *mapping = file->f_mapping;
    3992   274060322 :         struct inode *inode = mapping->host;
    3993   274060322 :         ssize_t ret;
    3994             : 
    3995   274060322 :         ret = file_remove_privs(file);
    3996   274060203 :         if (ret)
    3997             :                 return ret;
    3998             : 
    3999   274060154 :         ret = file_update_time(file);
    4000   274059872 :         if (ret)
    4001             :                 return ret;
    4002             : 
    4003   274059872 :         if (iocb->ki_flags & IOCB_DIRECT) {
    4004     5308776 :                 ret = generic_file_direct_write(iocb, from);
    4005             :                 /*
    4006             :                  * If the write stopped short of completing, fall back to
    4007             :                  * buffered writes.  Some filesystems do this for writes to
    4008             :                  * holes, for example.  For DAX files, a buffered write will
    4009             :                  * not succeed (even if it did, DAX does not handle dirty
    4010             :                  * page-cache pages correctly).
    4011             :                  */
    4012     5308776 :                 if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
    4013             :                         return ret;
    4014           0 :                 return direct_write_fallback(iocb, from, ret,
    4015             :                                 generic_perform_write(iocb, from));
    4016             :         }
    4017             : 
    4018   268751096 :         return generic_perform_write(iocb, from);
    4019             : }
    4020             : EXPORT_SYMBOL(__generic_file_write_iter);
    4021             : 
    4022             : /**
    4023             :  * generic_file_write_iter - write data to a file
    4024             :  * @iocb:       IO state structure
    4025             :  * @from:       iov_iter with data to write
    4026             :  *
    4027             :  * This is a wrapper around __generic_file_write_iter() to be used by most
    4028             :  * filesystems. It takes care of syncing the file in case of O_SYNC file
    4029             :  * and acquires i_rwsem as needed.
    4030             :  * Return:
     4031             :  * * negative error code if no data has been written at all or
    4032             :  *   vfs_fsync_range() failed for a synchronous write
    4033             :  * * number of bytes written, even for truncated writes
    4034             :  */
    4035   251401673 : ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    4036             : {
    4037   251401673 :         struct file *file = iocb->ki_filp;
    4038   251401673 :         struct inode *inode = file->f_mapping->host;
    4039   251401673 :         ssize_t ret;
    4040             : 
    4041   251401673 :         inode_lock(inode);
    4042   251402260 :         ret = generic_write_checks(iocb, from);
    4043   251402292 :         if (ret > 0)
    4044   251402307 :                 ret = __generic_file_write_iter(iocb, from);
    4045   251402229 :         inode_unlock(inode);
    4046             : 
    4047   251402408 :         if (ret > 0)
    4048   251402404 :                 ret = generic_write_sync(iocb, ret);
    4049   251402358 :         return ret;
    4050             : }
    4051             : EXPORT_SYMBOL(generic_file_write_iter);
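A hedged usage sketch (again not part of filemap.c): a filesystem that is content with the generic pagecache write path can point its file_operations straight at generic_file_write_iter(), which supplies the locking and O_SYNC handling described above. The table below is modelled on in-tree users such as ramfs; "examplefs" is a made-up name:

#include <linux/fs.h>

static const struct file_operations examplefs_file_operations = {
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,	/* takes i_rwsem, syncs O_SYNC writes */
	.mmap		= generic_file_mmap,
	.fsync		= noop_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.llseek		= generic_file_llseek,
};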
    4052             : 
    4053             : /**
    4054             :  * filemap_release_folio() - Release fs-specific metadata on a folio.
    4055             :  * @folio: The folio which the kernel is trying to free.
    4056             :  * @gfp: Memory allocation flags (and I/O mode).
    4057             :  *
    4058             :  * The address_space is trying to release any data attached to a folio
    4059             :  * (presumably at folio->private).
    4060             :  *
     4061             :  * This will also be called if the private_2 flag is set on the folio,
    4062             :  * indicating that the folio has other metadata associated with it.
    4063             :  *
    4064             :  * The @gfp argument specifies whether I/O may be performed to release
    4065             :  * this page (__GFP_IO), and whether the call may block
    4066             :  * (__GFP_RECLAIM & __GFP_FS).
    4067             :  *
    4068             :  * Return: %true if the release was successful, otherwise %false.
    4069             :  */
    4070    86224640 : bool filemap_release_folio(struct folio *folio, gfp_t gfp)
    4071             : {
    4072    86224640 :         struct address_space * const mapping = folio->mapping;
    4073             : 
    4074    86224640 :         BUG_ON(!folio_test_locked(folio));
    4075    86224640 :         if (folio_test_writeback(folio))
    4076             :                 return false;
    4077             : 
    4078    86224640 :         if (mapping && mapping->a_ops->release_folio)
    4079    73708280 :                 return mapping->a_ops->release_folio(folio, gfp);
    4080    12516360 :         return try_to_free_buffers(folio);
    4081             : }
    4082             : EXPORT_SYMBOL(filemap_release_folio);
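A minimal sketch of the other side of this call, assuming a hypothetical filesystem ("examplefs") whose examplefs_detach_private() helper is made up for illustration: a ->release_folio() implementation is expected to honour @gfp and to return true only when the private metadata really was dropped, which is what filemap_release_folio() above relies on.

#include <linux/pagemap.h>

static bool examplefs_release_folio(struct folio *folio, gfp_t gfp)
{
	/* Nothing attached: the folio is already safe to free. */
	if (!folio_test_private(folio))
		return true;

	/* Detaching our metadata would require I/O the caller forbade. */
	if (!(gfp & __GFP_IO))
		return false;

	examplefs_detach_private(folio);	/* hypothetical per-fs helper */
	return true;
}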
    4083             : 
    4084             : #ifdef CONFIG_CACHESTAT_SYSCALL
    4085             : /**
    4086             :  * filemap_cachestat() - compute the page cache statistics of a mapping
    4087             :  * @mapping:    The mapping to compute the statistics for.
    4088             :  * @first_index:        The starting page cache index.
    4089             :  * @last_index: The final page index (inclusive).
    4090             :  * @cs: the cachestat struct to write the result to.
    4091             :  *
    4092             :  * This will query the page cache statistics of a mapping in the
    4093             :  * page range of [first_index, last_index] (inclusive). The statistics
    4094             :  * queried include: number of dirty pages, number of pages marked for
    4095             :  * writeback, and the number of (recently) evicted pages.
    4096             :  */
    4097           0 : static void filemap_cachestat(struct address_space *mapping,
    4098             :                 pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
    4099             : {
    4100           0 :         XA_STATE(xas, &mapping->i_pages, first_index);
    4101           0 :         struct folio *folio;
    4102             : 
    4103           0 :         rcu_read_lock();
    4104           0 :         xas_for_each(&xas, folio, last_index) {
    4105           0 :                 unsigned long nr_pages;
    4106           0 :                 pgoff_t folio_first_index, folio_last_index;
    4107             : 
    4108           0 :                 if (xas_retry(&xas, folio))
    4109           0 :                         continue;
    4110             : 
    4111           0 :                 if (xa_is_value(folio)) {
    4112             :                         /* page is evicted */
    4113           0 :                         void *shadow = (void *)folio;
    4114           0 :                         bool workingset; /* not used */
    4115           0 :                         int order = xa_get_order(xas.xa, xas.xa_index);
    4116             : 
    4117           0 :                         nr_pages = 1 << order;
    4118           0 :                         folio_first_index = round_down(xas.xa_index, 1 << order);
    4119           0 :                         folio_last_index = folio_first_index + nr_pages - 1;
    4120             : 
    4121             :                         /* Folios might straddle the range boundaries, only count covered pages */
    4122           0 :                         if (folio_first_index < first_index)
    4123           0 :                                 nr_pages -= first_index - folio_first_index;
    4124             : 
    4125           0 :                         if (folio_last_index > last_index)
    4126           0 :                                 nr_pages -= folio_last_index - last_index;
    4127             : 
    4128           0 :                         cs->nr_evicted += nr_pages;
    4129             : 
    4130             : #ifdef CONFIG_SWAP /* implies CONFIG_MMU */
    4131           0 :                         if (shmem_mapping(mapping)) {
    4132             :                                 /* shmem file - in swap cache */
    4133           0 :                                 swp_entry_t swp = radix_to_swp_entry(folio);
    4134             : 
    4135           0 :                                 shadow = get_shadow_from_swap_cache(swp);
    4136             :                         }
    4137             : #endif
    4138           0 :                         if (workingset_test_recent(shadow, true, &workingset))
    4139           0 :                                 cs->nr_recently_evicted += nr_pages;
    4140             : 
    4141           0 :                         goto resched;
    4142             :                 }
    4143             : 
    4144           0 :                 nr_pages = folio_nr_pages(folio);
    4145           0 :                 folio_first_index = folio_pgoff(folio);
    4146           0 :                 folio_last_index = folio_first_index + nr_pages - 1;
    4147             : 
    4148             :                 /* Folios might straddle the range boundaries, only count covered pages */
    4149           0 :                 if (folio_first_index < first_index)
    4150           0 :                         nr_pages -= first_index - folio_first_index;
    4151             : 
    4152           0 :                 if (folio_last_index > last_index)
    4153           0 :                         nr_pages -= folio_last_index - last_index;
    4154             : 
    4155             :                 /* page is in cache */
    4156           0 :                 cs->nr_cache += nr_pages;
    4157             : 
    4158           0 :                 if (folio_test_dirty(folio))
    4159           0 :                         cs->nr_dirty += nr_pages;
    4160             : 
    4161           0 :                 if (folio_test_writeback(folio))
    4162           0 :                         cs->nr_writeback += nr_pages;
    4163             : 
    4164           0 : resched:
    4165           0 :                 if (need_resched()) {
    4166           0 :                         xas_pause(&xas);
    4167           0 :                         cond_resched_rcu();
    4168             :                 }
    4169             :         }
    4170           0 :         rcu_read_unlock();
    4171           0 : }
    4172             : 
    4173             : /*
    4174             :  * The cachestat(2) system call.
    4175             :  *
    4176             :  * cachestat() returns the page cache statistics of a file in the
    4177             :  * bytes range specified by `off` and `len`: number of cached pages,
    4178             :  * number of dirty pages, number of pages marked for writeback,
    4179             :  * number of evicted pages, and number of recently evicted pages.
    4180             :  *
     4181             :  * An evicted page is a page that was previously in the page cache
    4182             :  * but has been evicted since. A page is recently evicted if its last
    4183             :  * eviction was recent enough that its reentry to the cache would
    4184             :  * indicate that it is actively being used by the system, and that
    4185             :  * there is memory pressure on the system.
    4186             :  *
    4187             :  * `off` and `len` must be non-negative integers. If `len` > 0,
     4188             :  * the queried range is [`off`, `off` + `len` - 1]. If `len` == 0,
    4189             :  * we will query in the range from `off` to the end of the file.
    4190             :  *
    4191             :  * The `flags` argument is unused for now, but is included for future
     4192             :  * extensibility. Users should pass 0 (i.e. no flags specified).
    4193             :  *
    4194             :  * Currently, hugetlbfs is not supported.
    4195             :  *
    4196             :  * Because the status of a page can change after cachestat() checks it
    4197             :  * but before it returns to the application, the returned values may
    4198             :  * contain stale information.
    4199             :  *
    4200             :  * return values:
    4201             :  *  zero        - success
    4202             :  *  -EFAULT     - cstat or cstat_range points to an illegal address
    4203             :  *  -EINVAL     - invalid flags
    4204             :  *  -EBADF      - invalid file descriptor
    4205             :  *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
    4206             :  */
    4207           0 : SYSCALL_DEFINE4(cachestat, unsigned int, fd,
    4208             :                 struct cachestat_range __user *, cstat_range,
    4209             :                 struct cachestat __user *, cstat, unsigned int, flags)
    4210             : {
    4211           0 :         struct fd f = fdget(fd);
    4212           0 :         struct address_space *mapping;
    4213           0 :         struct cachestat_range csr;
    4214           0 :         struct cachestat cs;
    4215           0 :         pgoff_t first_index, last_index;
    4216             : 
    4217           0 :         if (!f.file)
    4218             :                 return -EBADF;
    4219             : 
    4220           0 :         if (copy_from_user(&csr, cstat_range,
    4221             :                         sizeof(struct cachestat_range))) {
    4222           0 :                 fdput(f);
    4223           0 :                 return -EFAULT;
    4224             :         }
    4225             : 
    4226             :         /* hugetlbfs is not supported */
    4227           0 :         if (is_file_hugepages(f.file)) {
    4228           0 :                 fdput(f);
    4229           0 :                 return -EOPNOTSUPP;
    4230             :         }
    4231             : 
    4232           0 :         if (flags != 0) {
    4233           0 :                 fdput(f);
    4234           0 :                 return -EINVAL;
    4235             :         }
    4236             : 
    4237           0 :         first_index = csr.off >> PAGE_SHIFT;
    4238           0 :         last_index =
    4239           0 :                 csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
    4240           0 :         memset(&cs, 0, sizeof(struct cachestat));
    4241           0 :         mapping = f.file->f_mapping;
    4242           0 :         filemap_cachestat(mapping, first_index, last_index, &cs);
    4243           0 :         fdput(f);
    4244             : 
    4245           0 :         if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
    4246           0 :                 return -EFAULT;
    4247             : 
    4248             :         return 0;
    4249             : }
    4250             : #endif /* CONFIG_CACHESTAT_SYSCALL */
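To round off the cachestat(2) description above, a hedged userspace sketch; it assumes uapi headers new enough to provide struct cachestat_range/struct cachestat in <linux/mman.h> and a __NR_cachestat definition, and uses raw syscall(2) since a libc wrapper may not exist yet:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mman.h>		/* struct cachestat_range, struct cachestat */

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* len == 0: query to EOF */
	struct cachestat cs;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
		perror("cachestat");
		close(fd);
		return 1;
	}
	printf("cache %llu dirty %llu writeback %llu evicted %llu recently evicted %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}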

Generated by: LCOV version 1.14