LCOV - fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023

LCOV - code coverage report

Current view:	top level - fs/ext4 - fast_commit.c (source / functions)		Hit	Total	Coverage
Test:	fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023	Lines:	62	1168	5.3 %
Date:	2023-07-31 20:08:34	Functions:	12	54	22.2 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : /*
       4             :  * fs/ext4/fast_commit.c
       5             :  *
       6             :  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
       7             :  *
       8             :  * Ext4 fast commits routines.
       9             :  */
      10             : #include "ext4.h"
      11             : #include "ext4_jbd2.h"
      12             : #include "ext4_extents.h"
      13             : #include "mballoc.h"
      14             : 
      15             : /*
      16             :  * Ext4 Fast Commits
      17             :  * -----------------
      18             :  *
      19             :  * Ext4 fast commits implement fine grained journalling for Ext4.
      20             :  *
      21             :  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
      22             :  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
      23             :  * TLV during the recovery phase. For the scenarios for which we currently
      24             :  * don't have replay code, fast commit falls back to full commits.
      25             :  * Fast commits record delta in one of the following three categories.
      26             :  *
      27             :  * (A) Directory entry updates:
      28             :  *
      29             :  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
      30             :  * - EXT4_FC_TAG_LINK           - records directory entry link
      31             :  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
      32             :  *
      33             :  * (B) File specific data range updates:
      34             :  *
      35             :  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
      36             :  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
      37             :  *
      38             :  * (C) Inode metadata (mtime / ctime etc):
      39             :  *
      40             :  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
      41             :  *                                during recovery. Note that iblocks field is
      42             :  *                                not replayed and instead derived during
      43             :  *                                replay.
      44             :  * Commit Operation
      45             :  * ----------------
      46             :  * With fast commits, we maintain all the directory entry operations in the
      47             :  * order in which they are issued in an in-memory queue. This queue is flushed
      48             :  * to disk during the commit operation. We also maintain a list of inodes
      49             :  * that need to be committed during a fast commit in another in memory queue of
      50             :  * inodes. During the commit operation, we commit in the following order:
      51             :  *
      52             :  * [1] Lock inodes for any further data updates by setting COMMITTING state
      53             :  * [2] Submit data buffers of all the inodes
      54             :  * [3] Wait for [2] to complete
      55             :  * [4] Commit all the directory entry updates in the fast commit space
      56             :  * [5] Commit all the changed inode structures
      57             :  * [6] Write tail tag (this tag ensures the atomicity, please read the following
      58             :  *     section for more details).
      59             :  * [7] Wait for [4], [5] and [6] to complete.
      60             :  *
      61             :  * All the inode updates must call ext4_fc_start_update() before starting an
      62             :  * update. If such an ongoing update is present, fast commit waits for it to
      63             :  * complete. The completion of such an update is marked by
      64             :  * ext4_fc_stop_update().
      65             :  *
      66             :  * Fast Commit Ineligibility
      67             :  * -------------------------
      68             :  *
      69             :  * Not all operations are supported by fast commits today (e.g. extended
      70             :  * attributes). Fast commit ineligibility is marked by calling
      71             :  * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
      72             :  * back to a full commit.
      73             :  *
      74             :  * Atomicity of commits
      75             :  * --------------------
      76             :  * In order to guarantee atomicity during the commit operation, fast commit
      77             :  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
      78             :  * tag contains CRC of the contents and TID of the transaction after which
      79             :  * this fast commit should be applied. Recovery code replays fast commit
      80             :  * logs only if there's at least 1 valid tail present. For every fast commit
      81             :  * operation, there is 1 tail. This means, we may end up with multiple tails
      82             :  * in the fast commit space. Here's an example:
      83             :  *
      84             :  * - Create a new file A and remove existing file B
      85             :  * - fsync()
      86             :  * - Append contents to file A
      87             :  * - Truncate file A
      88             :  * - fsync()
      89             :  *
      90             :  * The fast commit space at the end of above operations would look like this:
      91             :  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
      92             :  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
      93             :  *
      94             :  * Replay code should thus check for all the valid tails in the FC area.
      95             :  *
      96             :  * Fast Commit Replay Idempotence
      97             :  * ------------------------------
      98             :  *
      99             :  * Fast commit tags are idempotent in nature provided the recovery code follows
     100             :  * certain rules. The guiding principle that the commit path follows while
     101             :  * committing is that it stores the result of a particular operation instead of
     102             :  * storing the procedure.
     103             :  *
     104             :  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
     105             :  * was associated with inode 10. During fast commit, instead of storing this
     106             :  * operation as a procedure "rename a to b", we store the resulting file system
     107             :  * state as a "series" of outcomes:
     108             :  *
     109             :  * - Link dirent b to inode 10
     110             :  * - Unlink dirent a
     111             :  * - Inode <10> with valid refcount
     112             :  *
     113             :  * Now when recovery code runs, it needs to "enforce" this state on the file
     114             :  * system. This is what guarantees idempotence of fast commit replay.
     115             :  *
     116             :  * Let's take an example of a procedure that is not idempotent and see how fast
     117             :  * commits make it idempotent. Consider following sequence of operations:
     118             :  *
     119             :  *     rm A;    mv B A;    read A
     120             :  *  (x)     (y)        (z)
     121             :  *
     122             :  * (x), (y) and (z) are the points at which we can crash. If we store this
     123             :  * sequence of operations as is then the replay is not idempotent. Let's say
     124             :  * while in replay, we crash at (z). During the second replay, file A (which was
     125             :  * actually created as a result of "mv B A" operation) would get deleted. Thus,
     126             :  * file named A would be absent when we try to read A. So, this sequence of
     127             :  * operations is not idempotent. However, as mentioned above, instead of storing
     128             :  * the procedure fast commits store the outcome of each procedure. Thus the fast
     129             :  * commit log for above procedure would be as follows:
     130             :  *
     131             :  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
     132             :  * inode 11 before the replay)
     133             :  *
     134             :  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
     135             :  * (w)          (x)                    (y)          (z)
     136             :  *
     137             :  * If we crash at (z), we will have file A linked to inode 11. During the second
     138             :  * replay, we will remove file A (inode 11). But we will create it back and make
     139             :  * it point to inode 11. We won't find B, so we'll just skip that step. At this
     140             :  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
     141             :  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
     142             :  * similarly. Thus, by converting a non-idempotent procedure into a series of
     143             :  * idempotent outcomes, fast commits ensured idempotence during the replay.
     144             :  *
     145             :  * TODOs
     146             :  * -----
     147             :  *
     148             :  * 0) Fast commit replay path hardening: Fast commit replay code should use
     149             :  *    journal handles to make sure all the updates it does during the replay
     150             :  *    path are atomic. With that if we crash during fast commit replay, after
     151             :  *    trying to do recovery again, we will find a file system where fast commit
     152             :  *    area is invalid (because new full commit would be found). In order to deal
     153             :  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
     154             :  *    superblock state is persisted before starting the replay, so that after
     155             :  *    the crash, fast commit recovery code can look at that flag and perform
     156             :  *    fast commit recovery even if that area is invalidated by later full
     157             :  *    commits.
     158             :  *
     159             :  * 1) Fast commit's commit path locks the entire file system during fast
     160             :  *    commit. This has significant performance penalty. Instead of that, we
     161             :  *    should use ext4_fc_start/stop_update functions to start inode level
     162             :  *    updates from ext4_journal_start/stop. Once we do that we can drop file
     163             :  *    system locking during commit path.
     164             :  *
     165             :  * 2) Handle more ineligible cases.
     166             :  */
     167             : 
     168             : #include <trace/events/ext4.h>
     169             : static struct kmem_cache *ext4_fc_dentry_cachep;
     170             : 
     171           0 : static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
     172             : {
     173           0 :         BUFFER_TRACE(bh, "");
     174           0 :         if (uptodate) {
     175           0 :                 ext4_debug("%s: Block %lld up-to-date",
     176             :                            __func__, bh->b_blocknr);
     177           0 :                 set_buffer_uptodate(bh);
     178             :         } else {
     179           0 :                 ext4_debug("%s: Block %lld not up-to-date",
     180             :                            __func__, bh->b_blocknr);
     181           0 :                 clear_buffer_uptodate(bh);
     182             :         }
     183             : 
     184           0 :         unlock_buffer(bh);
     185           0 : }
     186             : 
     187             : static inline void ext4_fc_reset_inode(struct inode *inode)
     188             : {
     189     6016724 :         struct ext4_inode_info *ei = EXT4_I(inode);
     190             : 
     191     6016724 :         ei->i_fc_lblk_start = 0;
     192     6016724 :         ei->i_fc_lblk_len = 0;
     193           0 : }
     194             : 
     195     6016724 : void ext4_fc_init_inode(struct inode *inode)
     196             : {
     197     6016724 :         struct ext4_inode_info *ei = EXT4_I(inode);
     198             : 
     199     6016724 :         ext4_fc_reset_inode(inode);
     200     6016724 :         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
     201     6033024 :         INIT_LIST_HEAD(&ei->i_fc_list);
     202     6033024 :         INIT_LIST_HEAD(&ei->i_fc_dilist);
     203     6033024 :         init_waitqueue_head(&ei->i_fc_wait);
     204     6028932 :         atomic_set(&ei->i_fc_updates, 0);
     205     6028932 : }
     206             : 
     207             : /* This function must be called with sbi->s_fc_lock held. */
     208           0 : static void ext4_fc_wait_committing_inode(struct inode *inode)
     209             : __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
     210             : {
     211           0 :         wait_queue_head_t *wq;
     212           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     213             : 
     214             : #if (BITS_PER_LONG < 64)
     215             :         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
     216             :                         EXT4_STATE_FC_COMMITTING);
     217             :         wq = bit_waitqueue(&ei->i_state_flags,
     218             :                                 EXT4_STATE_FC_COMMITTING);
     219             : #else
     220           0 :         DEFINE_WAIT_BIT(wait, &ei->i_flags,
     221             :                         EXT4_STATE_FC_COMMITTING);
     222           0 :         wq = bit_waitqueue(&ei->i_flags,
     223             :                                 EXT4_STATE_FC_COMMITTING);
     224             : #endif
     225           0 :         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
     226           0 :         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
     227           0 :         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
     228           0 :         schedule();
     229           0 :         finish_wait(wq, &wait.wq_entry);
     230           0 : }
     231             : 
     232             : static bool ext4_fc_disabled(struct super_block *sb)
     233             : {
     234   166208158 :         return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
     235           0 :                 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
     236             : }
     237             : 
     238             : /*
     239             :  * Inform Ext4's fast commit about the start of an inode update
     240             :  *
     241             :  * This function is called by the high level VFS callbacks before
     242             :  * performing any inode update. This function blocks if there's an ongoing
     243             :  * fast commit on the inode in question.
     244             :  */
     245           0 : void ext4_fc_start_update(struct inode *inode)
     246             : {
     247           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     248             : 
     249           0 :         if (ext4_fc_disabled(inode->i_sb))
     250             :                 return;
     251             : 
     252           0 : restart:
     253           0 :         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
     254           0 :         if (list_empty(&ei->i_fc_list))
     255           0 :                 goto out;
     256             : 
     257           0 :         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
     258           0 :                 ext4_fc_wait_committing_inode(inode);
     259           0 :                 goto restart;
     260             :         }
     261           0 : out:
     262           0 :         atomic_inc(&ei->i_fc_updates);
     263           0 :         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
     264             : }
     265             : 
     266             : /*
     267             :  * Stop inode update and wake up waiting fast commits if any.
     268             :  */
     269           0 : void ext4_fc_stop_update(struct inode *inode)
     270             : {
     271           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     272             : 
     273           0 :         if (ext4_fc_disabled(inode->i_sb))
     274             :                 return;
     275             : 
     276           0 :         if (atomic_dec_and_test(&ei->i_fc_updates))
     277           0 :                 wake_up_all(&ei->i_fc_wait);
     278             : }
     279             : 
     280             : /*
     281             :  * Remove inode from fast commit list. If the inode is being committed
     282             :  * we wait until inode commit is done.
     283             :  */
     284     3248680 : void ext4_fc_del(struct inode *inode)
     285             : {
     286     3248680 :         struct ext4_inode_info *ei = EXT4_I(inode);
     287     3248680 :         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
     288     3248680 :         struct ext4_fc_dentry_update *fc_dentry;
     289             : 
     290     3248680 :         if (ext4_fc_disabled(inode->i_sb))
     291             :                 return;
     292             : 
     293           0 : restart:
     294           0 :         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
     295           0 :         if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
     296           0 :                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
     297           0 :                 return;
     298             :         }
     299             : 
     300           0 :         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
     301           0 :                 ext4_fc_wait_committing_inode(inode);
     302           0 :                 goto restart;
     303             :         }
     304             : 
     305           0 :         if (!list_empty(&ei->i_fc_list))
     306           0 :                 list_del_init(&ei->i_fc_list);
     307             : 
     308             :         /*
     309             :          * Since this inode is getting removed, let's also remove all FC
     310             :          * dentry create references, since it is not needed to log it anyways.
     311             :          */
     312           0 :         if (list_empty(&ei->i_fc_dilist)) {
     313           0 :                 spin_unlock(&sbi->s_fc_lock);
     314           0 :                 return;
     315             :         }
     316             : 
     317           0 :         fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
     318           0 :         WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
     319           0 :         list_del_init(&fc_dentry->fcd_list);
     320           0 :         list_del_init(&fc_dentry->fcd_dilist);
     321             : 
     322           0 :         WARN_ON(!list_empty(&ei->i_fc_dilist));
     323           0 :         spin_unlock(&sbi->s_fc_lock);
     324             : 
     325           0 :         if (fc_dentry->fcd_name.name &&
     326           0 :                 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
     327           0 :                 kfree(fc_dentry->fcd_name.name);
     328           0 :         kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
     329             : 
     330           0 :         return;
     331             : }
     332             : 
     333             : /*
     334             :  * Mark file system as fast commit ineligible, and record latest
     335             :  * ineligible transaction tid. This means until the recorded
     336             :  * transaction, commit operation would result in a full jbd2 commit.
     337             :  */
     338     1191727 : void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
     339             : {
     340     1191727 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     341     1191727 :         tid_t tid;
     342             : 
     343     1191727 :         if (ext4_fc_disabled(sb))
     344             :                 return;
     345             : 
     346           0 :         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
     347           0 :         if (handle && !IS_ERR(handle))
     348           0 :                 tid = handle->h_transaction->t_tid;
     349             :         else {
     350           0 :                 read_lock(&sbi->s_journal->j_state_lock);
     351           0 :                 tid = sbi->s_journal->j_running_transaction ?
     352           0 :                                 sbi->s_journal->j_running_transaction->t_tid : 0;
     353           0 :                 read_unlock(&sbi->s_journal->j_state_lock);
     354             :         }
     355           0 :         spin_lock(&sbi->s_fc_lock);
     356           0 :         if (sbi->s_fc_ineligible_tid < tid)
     357           0 :                 sbi->s_fc_ineligible_tid = tid;
     358           0 :         spin_unlock(&sbi->s_fc_lock);
     359           0 :         WARN_ON(reason >= EXT4_FC_REASON_MAX);
     360           0 :         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
     361             : }
     362             : 
     363             : /*
     364             :  * Generic fast commit tracking function. If this is the first time we are
     365             :  * called after a full commit, we initialize fast commit fields and then call
     366             :  * __fc_track_fn() with update = 0. If we have already been called after a full
     367             :  * commit, we pass update = 1. Based on that, the track function can determine
     368             :  * if it needs to track a field for the first time or if it needs to just
     369             :  * update the previously tracked value.
     370             :  *
     371             :  * If enqueue is set, this function enqueues the inode in fast commit list.
     372             :  */
     373           0 : static int ext4_fc_track_template(
     374             :         handle_t *handle, struct inode *inode,
     375             :         int (*__fc_track_fn)(struct inode *, void *, bool),
     376             :         void *args, int enqueue)
     377             : {
     378           0 :         bool update = false;
     379           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     380           0 :         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
     381           0 :         tid_t tid = 0;
     382           0 :         int ret;
     383             : 
     384           0 :         tid = handle->h_transaction->t_tid;
     385           0 :         mutex_lock(&ei->i_fc_lock);
     386           0 :         if (tid == ei->i_sync_tid) {
     387             :                 update = true;
     388             :         } else {
     389           0 :                 ext4_fc_reset_inode(inode);
     390           0 :                 ei->i_sync_tid = tid;
     391             :         }
     392           0 :         ret = __fc_track_fn(inode, args, update);
     393           0 :         mutex_unlock(&ei->i_fc_lock);
     394             : 
     395           0 :         if (!enqueue)
     396             :                 return ret;
     397             : 
     398           0 :         spin_lock(&sbi->s_fc_lock);
     399           0 :         if (list_empty(&EXT4_I(inode)->i_fc_list))
     400           0 :                 list_add_tail(&EXT4_I(inode)->i_fc_list,
     401           0 :                                 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
     402             :                                  sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
     403             :                                 &sbi->s_fc_q[FC_Q_STAGING] :
     404             :                                 &sbi->s_fc_q[FC_Q_MAIN]);
     405           0 :         spin_unlock(&sbi->s_fc_lock);
     406             : 
     407           0 :         return ret;
     408             : }
     409             : 
     410             : struct __track_dentry_update_args {
     411             :         struct dentry *dentry;
     412             :         int op;
     413             : };
     414             : 
     415             : /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
     416           0 : static int __track_dentry_update(struct inode *inode, void *arg, bool update)
     417             : {
     418           0 :         struct ext4_fc_dentry_update *node;
     419           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     420           0 :         struct __track_dentry_update_args *dentry_update =
     421             :                 (struct __track_dentry_update_args *)arg;
     422           0 :         struct dentry *dentry = dentry_update->dentry;
     423           0 :         struct inode *dir = dentry->d_parent->d_inode;
     424           0 :         struct super_block *sb = inode->i_sb;
     425           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     426             : 
     427           0 :         mutex_unlock(&ei->i_fc_lock);
     428             : 
     429           0 :         if (IS_ENCRYPTED(dir)) {
     430           0 :                 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
     431             :                                         NULL);
     432           0 :                 mutex_lock(&ei->i_fc_lock);
     433           0 :                 return -EOPNOTSUPP;
     434             :         }
     435             : 
     436           0 :         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
     437           0 :         if (!node) {
     438           0 :                 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
     439           0 :                 mutex_lock(&ei->i_fc_lock);
     440           0 :                 return -ENOMEM;
     441             :         }
     442             : 
     443           0 :         node->fcd_op = dentry_update->op;
     444           0 :         node->fcd_parent = dir->i_ino;
     445           0 :         node->fcd_ino = inode->i_ino;
     446           0 :         if (dentry->d_name.len > DNAME_INLINE_LEN) {
     447           0 :                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
     448           0 :                 if (!node->fcd_name.name) {
     449           0 :                         kmem_cache_free(ext4_fc_dentry_cachep, node);
     450           0 :                         ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
     451           0 :                         mutex_lock(&ei->i_fc_lock);
     452           0 :                         return -ENOMEM;
     453             :                 }
     454           0 :                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
     455             :                         dentry->d_name.len);
     456             :         } else {
     457           0 :                 memcpy(node->fcd_iname, dentry->d_name.name,
     458             :                         dentry->d_name.len);
     459           0 :                 node->fcd_name.name = node->fcd_iname;
     460             :         }
     461           0 :         node->fcd_name.len = dentry->d_name.len;
     462           0 :         INIT_LIST_HEAD(&node->fcd_dilist);
     463           0 :         spin_lock(&sbi->s_fc_lock);
     464           0 :         if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
     465             :                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
     466           0 :                 list_add_tail(&node->fcd_list,
     467             :                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
     468             :         else
     469           0 :                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
     470             : 
     471             :         /*
     472             :          * This helps us keep track of all fc_dentry updates which are part of
     473             :          * this ext4 inode. So in case the inode is getting unlinked before we
     474             :          * even get a chance to fsync, we can remove all fc_dentry
     475             :          * references while evicting the inode in ext4_fc_del().
     476             :          * Also with this, we don't need to loop over all the inodes in
     477             :          * sbi->s_fc_q to get the corresponding inode in
     478             :          * ext4_fc_commit_dentry_updates().
     479             :          */
     480           0 :         if (dentry_update->op == EXT4_FC_TAG_CREAT) {
     481           0 :                 WARN_ON(!list_empty(&ei->i_fc_dilist));
     482           0 :                 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
     483             :         }
     484           0 :         spin_unlock(&sbi->s_fc_lock);
     485           0 :         mutex_lock(&ei->i_fc_lock);
     486             : 
     487           0 :         return 0;
     488             : }
     489             : 
     490           0 : void __ext4_fc_track_unlink(handle_t *handle,
     491             :                 struct inode *inode, struct dentry *dentry)
     492             : {
     493           0 :         struct __track_dentry_update_args args;
     494           0 :         int ret;
     495             : 
     496           0 :         args.dentry = dentry;
     497           0 :         args.op = EXT4_FC_TAG_UNLINK;
     498             : 
     499           0 :         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
     500             :                                         (void *)&args, 0);
     501           0 :         trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
     502           0 : }
     503             : 
     504     1568711 : void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
     505             : {
     506     1568711 :         struct inode *inode = d_inode(dentry);
     507             : 
     508     1568711 :         if (ext4_fc_disabled(inode->i_sb))
     509             :                 return;
     510             : 
     511           0 :         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
     512             :                 return;
     513             : 
     514           0 :         __ext4_fc_track_unlink(handle, inode, dentry);
     515             : }
     516             : /* Track a link of @dentry to @inode: runs __track_dentry_update through ext4_fc_track_template() with op EXT4_FC_TAG_LINK. */
     517           0 : void __ext4_fc_track_link(handle_t *handle,
     518             :         struct inode *inode, struct dentry *dentry)
     519             : {
     520           0 :         struct __track_dentry_update_args args;
     521           0 :         int ret;
     522             : 
     523           0 :         args.dentry = dentry;
     524           0 :         args.op = EXT4_FC_TAG_LINK;
     525             : 
     526           0 :         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
     527             :                                         (void *)&args, 0);
     528           0 :         trace_ext4_fc_track_link(handle, inode, dentry, ret); /* ret only feeds the tracepoint; nothing is returned */
     529           0 : }
     530             : /* Entry point for link tracking: returns early when ext4_fc_disabled() is true or EXT4_MF_FC_INELIGIBLE is set on the sb. */
     531       76611 : void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
     532             : {
     533       76611 :         struct inode *inode = d_inode(dentry);
     534             : 
     535       76611 :         if (ext4_fc_disabled(inode->i_sb))
     536             :                 return;
     537             : 
     538           0 :         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
     539             :                 return;
     540             : 
     541           0 :         __ext4_fc_track_link(handle, inode, dentry);
     542             : }
     543             : /* Track creation of @dentry for @inode: runs __track_dentry_update through ext4_fc_track_template() with op EXT4_FC_TAG_CREAT. */
     544           0 : void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
     545             :                           struct dentry *dentry)
     546             : {
     547           0 :         struct __track_dentry_update_args args;
     548           0 :         int ret;
     549             : 
     550           0 :         args.dentry = dentry;
     551           0 :         args.op = EXT4_FC_TAG_CREAT;
     552             : 
     553           0 :         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
     554             :                                         (void *)&args, 0);
     555           0 :         trace_ext4_fc_track_create(handle, inode, dentry, ret); /* ret only feeds the tracepoint; nothing is returned */
     556           0 : }
     557             : /* Entry point for create tracking: returns early when ext4_fc_disabled() is true or EXT4_MF_FC_INELIGIBLE is set on the sb. */
     558     2169347 : void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
     559             : {
     560     2169347 :         struct inode *inode = d_inode(dentry);
     561             : 
     562     2169347 :         if (ext4_fc_disabled(inode->i_sb))
     563             :                 return;
     564             : 
     565           0 :         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
     566             :                 return;
     567             : 
     568           0 :         __ext4_fc_track_create(handle, inode, dentry);
     569             : }
     570             : 
     571             : /* __track_fn for inode tracking: returns -EEXIST when @update is true, otherwise resets i_fc_lblk_len to 0 and returns 0. */
     572           0 : static int __track_inode(struct inode *inode, void *arg, bool update)
     573             : {
     574           0 :         if (update)
     575             :                 return -EEXIST;
     576             : 
     577           0 :         EXT4_I(inode)->i_fc_lblk_len = 0;
     578             : 
     579           0 :         return 0;
     580             : }
     581             : /* Track a non-directory inode via __track_inode; an inode that journals its data marks the fs fast-commit ineligible instead. */
     582    78545956 : void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
     583             : {
     584    78545956 :         int ret;
     585             : 
     586    78545956 :         if (S_ISDIR(inode->i_mode))
     587             :                 return;
     588             : 
     589    68091899 :         if (ext4_fc_disabled(inode->i_sb))
     590             :                 return;
     591             : 
     592           0 :         if (ext4_should_journal_data(inode)) {
     593           0 :                 ext4_fc_mark_ineligible(inode->i_sb,
     594             :                                         EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
     595           0 :                 return;
     596             :         }
     597             : 
     598           0 :         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
     599             :                 return;
     600             : 
     601           0 :         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
     602           0 :         trace_ext4_fc_track_inode(handle, inode, ret); /* ret only feeds the tracepoint; nothing is returned */
     603             : }
     604             : /* Arguments for __track_range: a logical block range; __track_range treats both @start and @end as inclusive. */
     605             : struct __track_range_args {
     606             :         ext4_lblk_t start, end;
     607             : };
     608             : 
     609             : /* __track_fn for data updates: grows the inode's tracked range (i_fc_lblk_start, i_fc_lblk_len) to cover [start, end]; -ECANCELED for inodes below EXT4_FIRST_INO. */
     610           0 : static int __track_range(struct inode *inode, void *arg, bool update)
     611             : {
     612           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     613           0 :         ext4_lblk_t oldstart;
     614           0 :         struct __track_range_args *__arg =
     615             :                 (struct __track_range_args *)arg;
     616             : 
     617           0 :         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
     618             :                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
     619             :                 return -ECANCELED;
     620             :         }
     621             : 
     622           0 :         oldstart = ei->i_fc_lblk_start; /* saved: i_fc_lblk_start is overwritten before the old end is computed */
     623             : 
     624           0 :         if (update && ei->i_fc_lblk_len > 0) { /* a range is already tracked: take the union of old and new */
     625           0 :                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
     626           0 :                 ei->i_fc_lblk_len =
     627           0 :                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
     628           0 :                                 ei->i_fc_lblk_start + 1;
     629             :         } else { /* nothing tracked yet (or not an update): start from the new range alone */
     630           0 :                 ei->i_fc_lblk_start = __arg->start;
     631           0 :                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
     632             :         }
     633             : 
     634             :         return 0;
     635             : }
     636             : /* Track a change to logical blocks [start, end] of a non-directory inode; returns early when fast commit is disabled or EXT4_MF_FC_INELIGIBLE is set. */
     637     7138102 : void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
     638             :                          ext4_lblk_t end)
     639             : {
     640     7138102 :         struct __track_range_args args;
     641     7138102 :         int ret;
     642             : 
     643     7138102 :         if (S_ISDIR(inode->i_mode))
     644     7138102 :                 return;
     645             : 
     646     6757104 :         if (ext4_fc_disabled(inode->i_sb))
     647             :                 return;
     648             : 
     649           0 :         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
     650             :                 return;
     651             : 
     652           0 :         args.start = start;
     653           0 :         args.end = end;
     654             : 
     655           0 :         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
     656             : 
     657           0 :         trace_ext4_fc_track_range(handle, inode, start, end, ret); /* ret only feeds the tracepoint; nothing is returned */
     658             : }
     659             : /* Submit the current fast commit buffer (s_fc_bh) as a REQ_SYNC write, then clear s_fc_bh so the next reservation fetches a new buffer. */
     660           0 : static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
     661             : {
     662           0 :         blk_opf_t write_flags = REQ_SYNC;
     663           0 :         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
     664             : 
     665             :         /* Add REQ_FUA | REQ_PREFLUSH only for the tail block, and only when the BARRIER option is set */
     666           0 :         if (test_opt(sb, BARRIER) && is_tail)
     667           0 :                 write_flags |= REQ_FUA | REQ_PREFLUSH;
     668           0 :         lock_buffer(bh);
     669           0 :         set_buffer_dirty(bh);
     670           0 :         set_buffer_uptodate(bh);
     671           0 :         bh->b_end_io = ext4_end_buffer_io_sync;
     672           0 :         submit_bh(REQ_OP_WRITE | write_flags, bh);
     673           0 :         EXT4_SB(sb)->s_fc_bh = NULL;
     674           0 : }
     675             : 
     676             : /* Ext4 commit path routines */
     677             : 
     678             : /*
     679             :  * Allocate len bytes on a fast commit buffer.
     680             :  *
     681             :  * During commit this function manages fast commit block space. A single
     682             :  * reservation is never split across blocks: if the current block cannot hold
     683             :  * @len bytes and still leave room for a PAD header, the rest of the block is
     684             :  * filled with an EXT4_FC_TAG_PAD tlv, the whole block is folded into *crc and
     685             :  * submitted, and the bytes are handed out at the start of a new jbd2 block.
     686             :  * Returns a pointer to the reserved bytes, or NULL if @len can never fit in
     687             :  * one block alongside a PAD header or if jbd2_fc_get_buf() fails.
     688             :  */
     689           0 : static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
     690             : {
     691           0 :         struct ext4_fc_tl tl;
     692           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     693           0 :         struct buffer_head *bh;
     694           0 :         int bsize = sbi->s_journal->j_blocksize;
     695           0 :         int ret, off = sbi->s_fc_bytes % bsize; /* off: write offset inside the current block */
     696           0 :         int remaining;
     697           0 :         u8 *dst;
     698             : 
     699             :         /*
     700             :          * If 'len' is too long to fit in any block alongside a PAD tlv, then we
     701             :          * cannot fulfill the request.
     702             :          */
     703           0 :         if (len > bsize - EXT4_FC_TAG_BASE_LEN)
     704             :                 return NULL;
     705             : 
     706           0 :         if (!sbi->s_fc_bh) {
     707           0 :                 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
     708           0 :                 if (ret)
     709             :                         return NULL;
     710           0 :                 sbi->s_fc_bh = bh;
     711             :         }
     712           0 :         dst = sbi->s_fc_bh->b_data + off;
     713             : 
     714             :         /*
     715             :          * Allocate the bytes in the current block if we can do so while still
     716             :          * leaving enough space for a PAD tlv.
     717             :          */
     718           0 :         remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
     719           0 :         if (len <= remaining) {
     720           0 :                 sbi->s_fc_bytes += len;
     721           0 :                 return dst;
     722             :         }
     723             : 
     724             :         /*
     725             :          * Else, terminate the current block with a PAD tlv, then allocate a new
     726             :          * block and allocate the bytes at the start of that new block.
     727             :          */
     728             : 
     729           0 :         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
     730           0 :         tl.fc_len = cpu_to_le16(remaining);
     731           0 :         memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
     732           0 :         memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
     733           0 :         *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
     734             : 
     735           0 :         ext4_fc_submit_bh(sb, false);
     736             : 
     737           0 :         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
     738           0 :         if (ret)
     739             :                 return NULL;
     740           0 :         sbi->s_fc_bh = bh;
     741           0 :         sbi->s_fc_bytes += bsize - off + len; /* padded-out rest of the old block plus the new allocation */
     742           0 :         return sbi->s_fc_bh->b_data;
     743             : }
     744             : 
     745             : /*
     746             :  * Complete a fast commit by writing the tail tag.
     747             :  *
     748             :  * The tail tag marks the end of a fast commit. To guarantee atomicity, the
     749             :  * next commit must not reuse whatever space is left in this block after the
     750             :  * tail, so the tail's length also covers the rest of the block and
     751             :  * s_fc_bytes is rounded up to the block size. Returns 0, or -ENOSPC.
     752             :  */
     753           0 : static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
     754             : {
     755           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     756           0 :         struct ext4_fc_tl tl;
     757           0 :         struct ext4_fc_tail tail;
     758           0 :         int off, bsize = sbi->s_journal->j_blocksize;
     759           0 :         u8 *dst;
     760             : 
     761             :         /*
     762             :          * ext4_fc_reserve_space takes care of allocating an extra block if
     763             :          * there's not enough space on this block to accommodate this tail.
     764             :          */
     765           0 :         dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
     766           0 :         if (!dst)
     767             :                 return -ENOSPC;
     768             : 
     769           0 :         off = sbi->s_fc_bytes % bsize; /* offset just past the bytes reserved for the tail tlv */
     770             : 
     771           0 :         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
     772           0 :         tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
     773           0 :         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
     774             : 
     775           0 :         memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
     776           0 :         dst += EXT4_FC_TAG_BASE_LEN;
     777           0 :         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
     778           0 :         memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
     779           0 :         dst += sizeof(tail.fc_tid);
     780           0 :         crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, /* checksum covers the block up to, but not including, fc_crc */
     781           0 :                           dst - (u8 *)sbi->s_fc_bh->b_data);
     782           0 :         tail.fc_crc = cpu_to_le32(crc);
     783           0 :         memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
     784           0 :         dst += sizeof(tail.fc_crc);
     785           0 :         memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
     786             : 
     787           0 :         ext4_fc_submit_bh(sb, true);
     788             : 
     789           0 :         return 0;
     790             : }
     791             : 
     792             : /*
     793             :  * Adds tag, length and value. @crc is not updated here; it is only passed on to ext4_fc_reserve_space().
     794             :  * Returns true if the tlv was added, false if space could not be reserved.
     795             :  */
     796           0 : static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
     797             :                            u32 *crc)
     798             : {
     799           0 :         struct ext4_fc_tl tl;
     800           0 :         u8 *dst;
     801             : 
     802           0 :         dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
     803           0 :         if (!dst)
     804             :                 return false;
     805             : 
     806           0 :         tl.fc_tag = cpu_to_le16(tag);
     807           0 :         tl.fc_len = cpu_to_le16(len);
     808             : 
     809           0 :         memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
     810           0 :         memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
     811             : 
     812           0 :         return true;
     813             : }
     814             : 
     815             : /* Adds a dentry tlv: tl header (tag = fcd_op), struct ext4_fc_dentry_info (parent ino, ino), then the name bytes. Returns false if space could not be reserved. */
     816           0 : static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
     817             :                                    struct ext4_fc_dentry_update *fc_dentry)
     818             : {
     819           0 :         struct ext4_fc_dentry_info fcd;
     820           0 :         struct ext4_fc_tl tl;
     821           0 :         int dlen = fc_dentry->fcd_name.len;
     822           0 :         u8 *dst = ext4_fc_reserve_space(sb,
     823           0 :                         EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
     824             : 
     825           0 :         if (!dst)
     826             :                 return false;
     827             : 
     828           0 :         fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
     829           0 :         fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
     830           0 :         tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
     831           0 :         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
     832           0 :         memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
     833           0 :         dst += EXT4_FC_TAG_BASE_LEN;
     834           0 :         memcpy(dst, &fcd, sizeof(fcd));
     835           0 :         dst += sizeof(fcd);
     836           0 :         memcpy(dst, fc_dentry->fcd_name.name, dlen);
     837             : 
     838           0 :         return true;
     839             : }
     840             : 
     841             : /*
     842             :  * Writes the inode's raw on-disk image to the fast commit space as an EXT4_FC_TAG_INODE tlv.
     843             :  * Returns 0 on success, the ext4_get_inode_loc() error, or -ECANCELED if space cannot be reserved.
     844             :  */
     845           0 : static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
     846             : {
     847           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     848           0 :         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
     849           0 :         int ret;
     850           0 :         struct ext4_iloc iloc;
     851           0 :         struct ext4_fc_inode fc_inode;
     852           0 :         struct ext4_fc_tl tl;
     853           0 :         u8 *dst;
     854             : 
     855           0 :         ret = ext4_get_inode_loc(inode, &iloc);
     856           0 :         if (ret)
     857             :                 return ret;
     858             : 
     859           0 :         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
     860           0 :                 inode_len = EXT4_INODE_SIZE(inode->i_sb); /* inline data: copy the full on-disk inode size */
     861           0 :         else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
     862           0 :                 inode_len += ei->i_extra_isize; /* otherwise EXT4_GOOD_OLD_INODE_SIZE plus i_extra_isize */
     863             : 
     864           0 :         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
     865           0 :         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
     866           0 :         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
     867             : 
     868           0 :         ret = -ECANCELED; /* result if the reservation below fails */
     869           0 :         dst = ext4_fc_reserve_space(inode->i_sb,
     870           0 :                 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
     871           0 :         if (!dst)
     872           0 :                 goto err;
     873             : 
     874           0 :         memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
     875           0 :         dst += EXT4_FC_TAG_BASE_LEN;
     876           0 :         memcpy(dst, &fc_inode, sizeof(fc_inode));
     877           0 :         dst += sizeof(fc_inode);
     878           0 :         memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
     879           0 :         ret = 0;
     880           0 : err:
     881           0 :         brelse(iloc.bh); /* release the buffer obtained by ext4_get_inode_loc() on both paths */
     882             :         return ret;
     883             : }
     884             : 
     885             : /*
     886             :  * Consumes the inode's tracked block range and writes it as EXT4_FC_TAG_ADD_RANGE / EXT4_FC_TAG_DEL_RANGE tlvs.
     887             :  * Returns 0 on success, -ECANCELED if ext4_map_blocks() fails, -ENOSPC if a tlv cannot be added.
     888             :  */
     889           0 : static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
     890             : {
     891           0 :         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
     892           0 :         struct ext4_inode_info *ei = EXT4_I(inode);
     893           0 :         struct ext4_map_blocks map;
     894           0 :         struct ext4_fc_add_range fc_ext;
     895           0 :         struct ext4_fc_del_range lrange;
     896           0 :         struct ext4_extent *ex;
     897           0 :         int ret;
     898             : 
     899           0 :         mutex_lock(&ei->i_fc_lock);
     900           0 :         if (ei->i_fc_lblk_len == 0) {
     901           0 :                 mutex_unlock(&ei->i_fc_lock);
     902           0 :                 return 0;
     903             :         }
     904           0 :         old_blk_size = ei->i_fc_lblk_start; /* despite the name: first tracked logical block */
     905           0 :         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; /* despite the name: last tracked logical block (inclusive) */
     906           0 :         ei->i_fc_lblk_len = 0; /* the tracked range is consumed here, under i_fc_lock */
     907           0 :         mutex_unlock(&ei->i_fc_lock);
     908             : 
     909           0 :         cur_lblk_off = old_blk_size;
     910           0 :         ext4_debug("will try writing %d to %d for inode %ld\n",
     911             :                    cur_lblk_off, new_blk_size, inode->i_ino);
     912             : 
     913           0 :         while (cur_lblk_off <= new_blk_size) {
     914           0 :                 map.m_lblk = cur_lblk_off;
     915           0 :                 map.m_len = new_blk_size - cur_lblk_off + 1;
     916           0 :                 ret = ext4_map_blocks(NULL, inode, &map, 0);
     917           0 :                 if (ret < 0)
     918             :                         return -ECANCELED;
     919             : 
     920           0 :                 if (map.m_len == 0) { /* zero-length result: step one block so the loop still advances */
     921           0 :                         cur_lblk_off++;
     922           0 :                         continue;
     923             :                 }
     924             : 
     925           0 :                 if (ret == 0) { /* ret == 0 (presumably an unmapped span): record it as a DEL_RANGE */
     926           0 :                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
     927           0 :                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
     928           0 :                         lrange.fc_len = cpu_to_le32(map.m_len);
     929           0 :                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
     930             :                                             sizeof(lrange), (u8 *)&lrange, crc))
     931             :                                 return -ENOSPC;
     932             :                 } else { /* ret > 0: record the mapping as an ADD_RANGE carrying an ext4_extent */
     933           0 :                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
     934             :                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
     935             : 
     936             :                         /* Limit the number of blocks in one extent */
     937           0 :                         map.m_len = min(max, map.m_len);
     938             : 
     939           0 :                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
     940           0 :                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
     941           0 :                         ex->ee_block = cpu_to_le32(map.m_lblk);
     942           0 :                         ex->ee_len = cpu_to_le16(map.m_len);
     943           0 :                         ext4_ext_store_pblock(ex, map.m_pblk);
     944           0 :                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
     945           0 :                                 ext4_ext_mark_unwritten(ex);
     946             :                         else
     947           0 :                                 ext4_ext_mark_initialized(ex);
     948           0 :                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
     949             :                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
     950             :                                 return -ENOSPC;
     951             :                 }
     952             : 
     953           0 :                 cur_lblk_off += map.m_len;
     954             :         }
     955             : 
     956             :         return 0;
     957             : }
     958             : 
     959             : 
     960             : /* Submit data for every inode on s_fc_q[FC_Q_MAIN]: flag it EXT4_STATE_FC_COMMITTING, wait for i_fc_updates to reach zero, then call jbd2_submit_inode_data(). */
     961           0 : static int ext4_fc_submit_inode_data_all(journal_t *journal)
     962             : {
     963           0 :         struct super_block *sb = journal->j_private;
     964           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     965           0 :         struct ext4_inode_info *ei;
     966           0 :         int ret = 0;
     967             : 
     968           0 :         spin_lock(&sbi->s_fc_lock);
     969           0 :         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
     970           0 :                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
     971           0 :                 while (atomic_read(&ei->i_fc_updates)) {
     972           0 :                         DEFINE_WAIT(wait);
     973             : 
     974           0 :                         prepare_to_wait(&ei->i_fc_wait, &wait,
     975             :                                                 TASK_UNINTERRUPTIBLE);
     976           0 :                         if (atomic_read(&ei->i_fc_updates)) { /* re-check after queueing on i_fc_wait, before sleeping */
     977           0 :                                 spin_unlock(&sbi->s_fc_lock); /* s_fc_lock is dropped across schedule() */
     978           0 :                                 schedule();
     979           0 :                                 spin_lock(&sbi->s_fc_lock);
     980             :                         }
     981           0 :                         finish_wait(&ei->i_fc_wait, &wait);
     982             :                 }
     983           0 :                 spin_unlock(&sbi->s_fc_lock); /* submit without holding s_fc_lock */
     984           0 :                 ret = jbd2_submit_inode_data(journal, ei->jinode);
     985           0 :                 if (ret)
     986           0 :                         return ret; /* s_fc_lock is not held on this path */
     987           0 :                 spin_lock(&sbi->s_fc_lock);
     988             :         }
     989           0 :         spin_unlock(&sbi->s_fc_lock);
     990             : 
     991           0 :         return ret;
     992             : }
     993             : 
     994             : /* Wait for data of every s_fc_q[FC_Q_MAIN] inode that is flagged EXT4_STATE_FC_COMMITTING; returns the first jbd2_wait_inode_data() error, else 0. */
     995           0 : static int ext4_fc_wait_inode_data_all(journal_t *journal)
     996             : {
     997           0 :         struct super_block *sb = journal->j_private;
     998           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     999           0 :         struct ext4_inode_info *pos, *n;
    1000           0 :         int ret = 0;
    1001             : 
    1002           0 :         spin_lock(&sbi->s_fc_lock);
    1003           0 :         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
    1004           0 :                 if (!ext4_test_inode_state(&pos->vfs_inode,
    1005             :                                            EXT4_STATE_FC_COMMITTING))
    1006           0 :                         continue;
    1007           0 :                 spin_unlock(&sbi->s_fc_lock); /* wait without holding s_fc_lock */
    1008             : 
    1009           0 :                 ret = jbd2_wait_inode_data(journal, pos->jinode);
    1010           0 :                 if (ret)
    1011           0 :                         return ret; /* s_fc_lock is not held on this path */
    1012           0 :                 spin_lock(&sbi->s_fc_lock);
    1013             :         }
    1014           0 :         spin_unlock(&sbi->s_fc_lock);
    1015             : 
    1016           0 :         return 0;
    1017             : }
    1018             : 
    1019             : /* Commit all queued directory entry updates. Every path unlocks s_fc_lock before it locks it, so the caller must hold it; it is held again on return. */
    1020           0 : static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
    1021             : __acquires(&sbi->s_fc_lock)
    1022             : __releases(&sbi->s_fc_lock)
    1023             : {
    1024           0 :         struct super_block *sb = journal->j_private;
    1025           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1026           0 :         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
    1027           0 :         struct inode *inode;
    1028           0 :         struct ext4_inode_info *ei;
    1029           0 :         int ret;
    1030             : 
    1031           0 :         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
    1032             :                 return 0;
    1033           0 :         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
    1034             :                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
    1035           0 :                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { /* link/unlink: only the dentry tlv is written */
    1036           0 :                         spin_unlock(&sbi->s_fc_lock);
    1037           0 :                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
    1038           0 :                                 ret = -ENOSPC;
    1039           0 :                                 goto lock_and_exit;
    1040             :                         }
    1041           0 :                         spin_lock(&sbi->s_fc_lock);
    1042           0 :                         continue;
    1043             :                 }
    1044             :                 /*
    1045             :                  * With fcd_dilist we need not loop in sbi->s_fc_q to get the
    1046             :                  * corresponding inode pointer
    1047             :                  */
    1048           0 :                 WARN_ON(list_empty(&fc_dentry->fcd_dilist));
    1049           0 :                 ei = list_first_entry(&fc_dentry->fcd_dilist,
    1050             :                                 struct ext4_inode_info, i_fc_dilist);
    1051           0 :                 inode = &ei->vfs_inode;
    1052           0 :                 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
    1053             : 
    1054           0 :                 spin_unlock(&sbi->s_fc_lock);
    1055             : 
    1056             :                 /*
    1057             :                  * We first write the inode and then the create dirent. This
    1058             :                  * allows the recovery code to create an unnamed inode first
    1059             :                  * and then link it to a directory entry. This allows us
    1060             :                  * to use namei.c routines almost as is and simplifies
    1061             :                  * the recovery code.
    1062             :                  */
    1063           0 :                 ret = ext4_fc_write_inode(inode, crc);
    1064           0 :                 if (ret)
    1065           0 :                         goto lock_and_exit;
    1066             : 
    1067           0 :                 ret = ext4_fc_write_inode_data(inode, crc);
    1068           0 :                 if (ret)
    1069           0 :                         goto lock_and_exit;
    1070             : 
    1071           0 :                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
    1072           0 :                         ret = -ENOSPC;
    1073           0 :                         goto lock_and_exit;
    1074             :                 }
    1075             : 
    1076           0 :                 spin_lock(&sbi->s_fc_lock);
    1077             :         }
    1078             :         return 0;
    1079           0 : lock_and_exit: /* error exits arrive here unlocked; re-take s_fc_lock before returning */
    1080           0 :         spin_lock(&sbi->s_fc_lock);
    1081           0 :         return ret;
    1082             : }
    1083             : 
    1084           0 : static int ext4_fc_perform_commit(journal_t *journal)
    1085             : {
    1086           0 :         struct super_block *sb = journal->j_private;
    1087           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1088           0 :         struct ext4_inode_info *iter;
    1089           0 :         struct ext4_fc_head head;
    1090           0 :         struct inode *inode;
    1091           0 :         struct blk_plug plug;
    1092           0 :         int ret = 0;
    1093           0 :         u32 crc = 0;
    1094             : 
    1095           0 :         ret = ext4_fc_submit_inode_data_all(journal);
    1096           0 :         if (ret)
    1097             :                 return ret;
    1098             : 
    1099           0 :         ret = ext4_fc_wait_inode_data_all(journal);
    1100           0 :         if (ret)
    1101             :                 return ret;
    1102             : 
    1103             :         /*
    1104             :          * If file system device is different from journal device, issue a cache
    1105             :          * flush before we start writing fast commit blocks.
    1106             :          */
    1107           0 :         if (journal->j_fs_dev != journal->j_dev)
    1108           0 :                 blkdev_issue_flush(journal->j_fs_dev);
    1109             : 
    1110           0 :         blk_start_plug(&plug);
    1111           0 :         if (sbi->s_fc_bytes == 0) {
    1112             :                 /*
    1113             :                  * Add a head tag only if this is the first fast commit
    1114             :                  * in this TID.
    1115             :                  */
    1116           0 :                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
    1117           0 :                 head.fc_tid = cpu_to_le32(
    1118             :                         sbi->s_journal->j_running_transaction->t_tid);
    1119           0 :                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
    1120             :                         (u8 *)&head, &crc)) {
    1121           0 :                         ret = -ENOSPC;
    1122           0 :                         goto out;
    1123             :                 }
    1124             :         }
    1125             : 
    1126           0 :         spin_lock(&sbi->s_fc_lock);
    1127           0 :         ret = ext4_fc_commit_dentry_updates(journal, &crc);
    1128           0 :         if (ret) {
    1129           0 :                 spin_unlock(&sbi->s_fc_lock);
    1130           0 :                 goto out;
    1131             :         }
    1132             : 
    1133           0 :         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
    1134           0 :                 inode = &iter->vfs_inode;
    1135           0 :                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
    1136           0 :                         continue;
    1137             : 
    1138           0 :                 spin_unlock(&sbi->s_fc_lock);
    1139           0 :                 ret = ext4_fc_write_inode_data(inode, &crc);
    1140           0 :                 if (ret)
    1141           0 :                         goto out;
    1142           0 :                 ret = ext4_fc_write_inode(inode, &crc);
    1143           0 :                 if (ret)
    1144           0 :                         goto out;
    1145           0 :                 spin_lock(&sbi->s_fc_lock);
    1146             :         }
    1147           0 :         spin_unlock(&sbi->s_fc_lock);
    1148             : 
    1149           0 :         ret = ext4_fc_write_tail(sb, crc);
    1150             : 
    1151           0 : out:
    1152           0 :         blk_finish_plug(&plug);
    1153           0 :         return ret;
    1154             : }
    1155             : 
    1156           0 : static void ext4_fc_update_stats(struct super_block *sb, int status,
    1157             :                                  u64 commit_time, int nblks, tid_t commit_tid)
    1158             : {
    1159           0 :         struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
    1160             : 
    1161           0 :         ext4_debug("Fast commit ended with status = %d for tid %u",
    1162             :                         status, commit_tid);
    1163           0 :         if (status == EXT4_FC_STATUS_OK) {
    1164           0 :                 stats->fc_num_commits++;
    1165           0 :                 stats->fc_numblks += nblks;
    1166           0 :                 if (likely(stats->s_fc_avg_commit_time))
    1167           0 :                         stats->s_fc_avg_commit_time =
    1168           0 :                                 (commit_time +
    1169           0 :                                  stats->s_fc_avg_commit_time * 3) / 4;
    1170             :                 else
    1171           0 :                         stats->s_fc_avg_commit_time = commit_time;
    1172           0 :         } else if (status == EXT4_FC_STATUS_FAILED ||
    1173           0 :                    status == EXT4_FC_STATUS_INELIGIBLE) {
    1174           0 :                 if (status == EXT4_FC_STATUS_FAILED)
    1175           0 :                         stats->fc_failed_commits++;
    1176           0 :                 stats->fc_ineligible_commits++;
    1177             :         } else {
    1178           0 :                 stats->fc_skipped_commits++;
    1179             :         }
    1180           0 :         trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
    1181           0 : }
    1182             : 
    1183             : /*
    1184             :  * The main commit entry point. Performs a fast commit for transaction
    1185             :  * commit_tid if needed. If it's not possible to perform a fast commit
    1186             :  * due to various reasons, we fall back to full commit. Returns 0
    1187             :  * on success, error otherwise.
    1188             :  */
    1189      237134 : int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
    1190             : {
    1191      237134 :         struct super_block *sb = journal->j_private;
    1192      237134 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1193      237134 :         int nblks = 0, ret, bsize = journal->j_blocksize;
    1194      237134 :         int subtid = atomic_read(&sbi->s_fc_subtid);
    1195      237134 :         int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
    1196      237134 :         ktime_t start_time, commit_time;
    1197             : 
    1198      237134 :         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
    1199      237134 :                 return jbd2_complete_transaction(journal, commit_tid);
    1200             : 
    1201           0 :         trace_ext4_fc_commit_start(sb, commit_tid);
    1202             : 
    1203           0 :         start_time = ktime_get();
    1204             : 
    1205           0 : restart_fc:
    1206           0 :         ret = jbd2_fc_begin_commit(journal, commit_tid);
    1207           0 :         if (ret == -EALREADY) {
    1208             :                 /* There was an ongoing commit, check if we need to restart */
    1209           0 :                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
    1210           0 :                         commit_tid > journal->j_commit_sequence)
    1211           0 :                         goto restart_fc;
    1212           0 :                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
    1213             :                                 commit_tid);
    1214           0 :                 return 0;
    1215           0 :         } else if (ret) {
    1216             :                 /*
    1217             :                  * Commit couldn't start. Just update stats and perform a
    1218             :                  * full commit.
    1219             :                  */
    1220           0 :                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
    1221             :                                 commit_tid);
    1222           0 :                 return jbd2_complete_transaction(journal, commit_tid);
    1223             :         }
    1224             : 
    1225             :         /*
    1226             :          * After establishing journal barrier via jbd2_fc_begin_commit(), check
    1227             :          * if we are fast commit ineligible.
    1228             :          */
    1229           0 :         if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
    1230           0 :                 status = EXT4_FC_STATUS_INELIGIBLE;
    1231           0 :                 goto fallback;
    1232             :         }
    1233             : 
    1234           0 :         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
    1235           0 :         ret = ext4_fc_perform_commit(journal);
    1236           0 :         if (ret < 0) {
    1237           0 :                 status = EXT4_FC_STATUS_FAILED;
    1238           0 :                 goto fallback;
    1239             :         }
    1240           0 :         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
    1241           0 :         ret = jbd2_fc_wait_bufs(journal, nblks);
    1242           0 :         if (ret < 0) {
    1243           0 :                 status = EXT4_FC_STATUS_FAILED;
    1244           0 :                 goto fallback;
    1245             :         }
    1246           0 :         atomic_inc(&sbi->s_fc_subtid);
    1247           0 :         ret = jbd2_fc_end_commit(journal);
    1248             :         /*
    1249             :          * weight the commit time higher than the average time so we
    1250             :          * don't react too strongly to vast changes in the commit time
    1251             :          */
    1252           0 :         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
    1253           0 :         ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
    1254           0 :         return ret;
    1255             : 
    1256           0 : fallback:
    1257           0 :         ret = jbd2_fc_end_commit_fallback(journal);
    1258           0 :         ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
    1259           0 :         return ret;
    1260             : }
    1261             : 
    1262             : /*
    1263             :  * Fast commit cleanup routine. This is called after every fast commit and
    1264             :  * full commit. full is true if we are called after a full commit.
    1265             :  */
    1266           0 : static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
    1267             : {
    1268           0 :         struct super_block *sb = journal->j_private;
    1269           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1270           0 :         struct ext4_inode_info *iter, *iter_n;
    1271           0 :         struct ext4_fc_dentry_update *fc_dentry;
    1272             : 
    1273           0 :         if (full && sbi->s_fc_bh)
    1274           0 :                 sbi->s_fc_bh = NULL;
    1275             : 
    1276           0 :         trace_ext4_fc_cleanup(journal, full, tid);
    1277           0 :         jbd2_fc_release_bufs(journal);
    1278             : 
    1279           0 :         spin_lock(&sbi->s_fc_lock);
    1280           0 :         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
    1281             :                                  i_fc_list) {
    1282           0 :                 list_del_init(&iter->i_fc_list);
    1283           0 :                 ext4_clear_inode_state(&iter->vfs_inode,
    1284             :                                        EXT4_STATE_FC_COMMITTING);
    1285           0 :                 if (iter->i_sync_tid <= tid)
    1286           0 :                         ext4_fc_reset_inode(&iter->vfs_inode);
    1287             :                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
    1288           0 :                 smp_mb();
    1289             : #if (BITS_PER_LONG < 64)
    1290             :                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
    1291             : #else
    1292           0 :                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
    1293             : #endif
    1294             :         }
    1295             : 
    1296           0 :         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
    1297           0 :                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
    1298             :                                              struct ext4_fc_dentry_update,
    1299             :                                              fcd_list);
    1300           0 :                 list_del_init(&fc_dentry->fcd_list);
    1301           0 :                 list_del_init(&fc_dentry->fcd_dilist);
    1302           0 :                 spin_unlock(&sbi->s_fc_lock);
    1303             : 
    1304           0 :                 if (fc_dentry->fcd_name.name &&
    1305           0 :                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
    1306           0 :                         kfree(fc_dentry->fcd_name.name);
    1307           0 :                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
    1308           0 :                 spin_lock(&sbi->s_fc_lock);
    1309             :         }
    1310             : 
    1311           0 :         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
    1312             :                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
    1313           0 :         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
    1314             :                                 &sbi->s_fc_q[FC_Q_MAIN]);
    1315             : 
    1316           0 :         if (tid >= sbi->s_fc_ineligible_tid) {
    1317           0 :                 sbi->s_fc_ineligible_tid = 0;
    1318           0 :                 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
    1319             :         }
    1320             : 
    1321           0 :         if (full)
    1322           0 :                 sbi->s_fc_bytes = 0;
    1323           0 :         spin_unlock(&sbi->s_fc_lock);
    1324           0 :         trace_ext4_fc_stats(sb);
    1325           0 : }
    1326             : 
    1327             : /* Ext4 Replay Path Routines */
    1328             : 
    1329             : /* Helper struct for dentry replay routines */
    1330             : struct dentry_info_args {
    1331             :         int parent_ino, dname_len, ino, inode_len;
    1332             :         char *dname;
    1333             : };
    1334             : 
    1335             : /* Same as struct ext4_fc_tl, but uses native endianness fields */
    1336             : struct ext4_fc_tl_mem {
    1337             :         u16 fc_tag;
    1338             :         u16 fc_len;
    1339             : };
    1340             : 
    1341           0 : static inline void tl_to_darg(struct dentry_info_args *darg,
    1342             :                               struct ext4_fc_tl_mem *tl, u8 *val)
    1343             : {
    1344           0 :         struct ext4_fc_dentry_info fcd;
    1345             : 
    1346           0 :         memcpy(&fcd, val, sizeof(fcd));
    1347             : 
    1348           0 :         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
    1349           0 :         darg->ino = le32_to_cpu(fcd.fc_ino);
    1350           0 :         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
    1351           0 :         darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
    1352           0 : }
    1353             : 
    1354           0 : static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
    1355             : {
    1356           0 :         struct ext4_fc_tl tl_disk;
    1357             : 
    1358           0 :         memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
    1359           0 :         tl->fc_len = le16_to_cpu(tl_disk.fc_len);
    1360           0 :         tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
    1361           0 : }
    1362             : 
    1363             : /* Unlink replay function */
    1364           0 : static int ext4_fc_replay_unlink(struct super_block *sb,
    1365             :                                  struct ext4_fc_tl_mem *tl, u8 *val)
    1366             : {
    1367           0 :         struct inode *inode, *old_parent;
    1368           0 :         struct qstr entry;
    1369           0 :         struct dentry_info_args darg;
    1370           0 :         int ret = 0;
    1371             : 
    1372           0 :         tl_to_darg(&darg, tl, val);
    1373             : 
    1374           0 :         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
    1375             :                         darg.parent_ino, darg.dname_len);
    1376             : 
    1377           0 :         entry.name = darg.dname;
    1378           0 :         entry.len = darg.dname_len;
    1379           0 :         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
    1380             : 
    1381           0 :         if (IS_ERR(inode)) {
    1382             :                 ext4_debug("Inode %d not found", darg.ino);
    1383             :                 return 0;
    1384             :         }
    1385             : 
    1386           0 :         old_parent = ext4_iget(sb, darg.parent_ino,
    1387             :                                 EXT4_IGET_NORMAL);
    1388           0 :         if (IS_ERR(old_parent)) {
    1389           0 :                 ext4_debug("Dir with inode %d not found", darg.parent_ino);
    1390           0 :                 iput(inode);
    1391           0 :                 return 0;
    1392             :         }
    1393             : 
    1394           0 :         ret = __ext4_unlink(old_parent, &entry, inode, NULL);
    1395             :         /* -ENOENT ok coz it might not exist anymore. */
    1396           0 :         if (ret == -ENOENT)
    1397           0 :                 ret = 0;
    1398           0 :         iput(old_parent);
    1399           0 :         iput(inode);
    1400           0 :         return ret;
    1401             : }
    1402             : 
    1403           0 : static int ext4_fc_replay_link_internal(struct super_block *sb,
    1404             :                                 struct dentry_info_args *darg,
    1405             :                                 struct inode *inode)
    1406             : {
    1407           0 :         struct inode *dir = NULL;
    1408           0 :         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
    1409           0 :         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
    1410           0 :         int ret = 0;
    1411             : 
    1412           0 :         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
    1413           0 :         if (IS_ERR(dir)) {
    1414           0 :                 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
    1415           0 :                 dir = NULL;
    1416           0 :                 goto out;
    1417             :         }
    1418             : 
    1419           0 :         dentry_dir = d_obtain_alias(dir);
    1420           0 :         if (IS_ERR(dentry_dir)) {
    1421           0 :                 ext4_debug("Failed to obtain dentry");
    1422           0 :                 dentry_dir = NULL;
    1423           0 :                 goto out;
    1424             :         }
    1425             : 
    1426           0 :         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
    1427           0 :         if (!dentry_inode) {
    1428           0 :                 ext4_debug("Inode dentry not created.");
    1429           0 :                 ret = -ENOMEM;
    1430           0 :                 goto out;
    1431             :         }
    1432             : 
    1433           0 :         ret = __ext4_link(dir, inode, dentry_inode);
    1434             :         /*
    1435             :          * It's possible that link already existed since data blocks
    1436             :          * for the dir in question got persisted before we crashed OR
    1437             :          * we replayed this tag and crashed before the entire replay
    1438             :          * could complete.
    1439             :          */
    1440           0 :         if (ret && ret != -EEXIST) {
    1441           0 :                 ext4_debug("Failed to link\n");
    1442           0 :                 goto out;
    1443             :         }
    1444             : 
    1445             :         ret = 0;
    1446           0 : out:
    1447           0 :         if (dentry_dir) {
    1448           0 :                 d_drop(dentry_dir);
    1449           0 :                 dput(dentry_dir);
    1450           0 :         } else if (dir) {
    1451           0 :                 iput(dir);
    1452             :         }
    1453           0 :         if (dentry_inode) {
    1454           0 :                 d_drop(dentry_inode);
    1455           0 :                 dput(dentry_inode);
    1456             :         }
    1457             : 
    1458           0 :         return ret;
    1459             : }
    1460             : 
    1461             : /* Link replay function */
    1462           0 : static int ext4_fc_replay_link(struct super_block *sb,
    1463             :                                struct ext4_fc_tl_mem *tl, u8 *val)
    1464             : {
    1465           0 :         struct inode *inode;
    1466           0 :         struct dentry_info_args darg;
    1467           0 :         int ret = 0;
    1468             : 
    1469           0 :         tl_to_darg(&darg, tl, val);
    1470           0 :         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
    1471             :                         darg.parent_ino, darg.dname_len);
    1472             : 
    1473           0 :         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
    1474           0 :         if (IS_ERR(inode)) {
    1475             :                 ext4_debug("Inode not found.");
    1476             :                 return 0;
    1477             :         }
    1478             : 
    1479           0 :         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
    1480           0 :         iput(inode);
    1481           0 :         return ret;
    1482             : }
    1483             : 
    1484             : /*
    1485             :  * Record all the modified inodes during replay. We use this later to setup
    1486             :  * block bitmaps correctly.
    1487             :  */
    1488           0 : static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
    1489             : {
    1490           0 :         struct ext4_fc_replay_state *state;
    1491           0 :         int i;
    1492             : 
    1493           0 :         state = &EXT4_SB(sb)->s_fc_replay_state;
    1494           0 :         for (i = 0; i < state->fc_modified_inodes_used; i++)
    1495           0 :                 if (state->fc_modified_inodes[i] == ino)
    1496             :                         return 0;
    1497           0 :         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
    1498           0 :                 int *fc_modified_inodes;
    1499             : 
    1500           0 :                 fc_modified_inodes = krealloc(state->fc_modified_inodes,
    1501           0 :                                 sizeof(int) * (state->fc_modified_inodes_size +
    1502             :                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
    1503             :                                 GFP_KERNEL);
    1504           0 :                 if (!fc_modified_inodes)
    1505             :                         return -ENOMEM;
    1506           0 :                 state->fc_modified_inodes = fc_modified_inodes;
    1507           0 :                 state->fc_modified_inodes_size +=
    1508             :                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
    1509             :         }
    1510           0 :         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
    1511           0 :         return 0;
    1512             : }
    1513             : 
    1514             : /*
    1515             :  * Inode replay function
    1516             :  */
    1517           0 : static int ext4_fc_replay_inode(struct super_block *sb,
    1518             :                                 struct ext4_fc_tl_mem *tl, u8 *val)
    1519             : {
    1520           0 :         struct ext4_fc_inode fc_inode;
    1521           0 :         struct ext4_inode *raw_inode;
    1522           0 :         struct ext4_inode *raw_fc_inode;
    1523           0 :         struct inode *inode = NULL;
    1524           0 :         struct ext4_iloc iloc;
    1525           0 :         int inode_len, ino, ret, tag = tl->fc_tag;
    1526           0 :         struct ext4_extent_header *eh;
    1527           0 :         size_t off_gen = offsetof(struct ext4_inode, i_generation);
    1528             : 
    1529           0 :         memcpy(&fc_inode, val, sizeof(fc_inode));
    1530             : 
    1531           0 :         ino = le32_to_cpu(fc_inode.fc_ino);
    1532           0 :         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
    1533             : 
    1534           0 :         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
    1535           0 :         if (!IS_ERR(inode)) {
    1536           0 :                 ext4_ext_clear_bb(inode);
    1537           0 :                 iput(inode);
    1538             :         }
    1539           0 :         inode = NULL;
    1540             : 
    1541           0 :         ret = ext4_fc_record_modified_inode(sb, ino);
    1542           0 :         if (ret)
    1543           0 :                 goto out;
    1544             : 
    1545           0 :         raw_fc_inode = (struct ext4_inode *)
    1546             :                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
    1547           0 :         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
    1548           0 :         if (ret)
    1549           0 :                 goto out;
    1550             : 
    1551           0 :         inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
    1552           0 :         raw_inode = ext4_raw_inode(&iloc);
    1553             : 
    1554           0 :         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
    1555           0 :         memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
    1556             :                inode_len - off_gen);
    1557           0 :         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
    1558           0 :                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
    1559           0 :                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
    1560           0 :                         memset(eh, 0, sizeof(*eh));
    1561           0 :                         eh->eh_magic = EXT4_EXT_MAGIC;
    1562           0 :                         eh->eh_max = cpu_to_le16(
    1563             :                                 (sizeof(raw_inode->i_block) -
    1564             :                                  sizeof(struct ext4_extent_header))
    1565             :                                  / sizeof(struct ext4_extent));
    1566             :                 }
    1567           0 :         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
    1568           0 :                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
    1569             :                         sizeof(raw_inode->i_block));
    1570             :         }
    1571             : 
    1572             :         /* Immediately update the inode on disk. */
    1573           0 :         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
    1574           0 :         if (ret)
    1575           0 :                 goto out;
    1576           0 :         ret = sync_dirty_buffer(iloc.bh);
    1577           0 :         if (ret)
    1578           0 :                 goto out;
    1579           0 :         ret = ext4_mark_inode_used(sb, ino);
    1580           0 :         if (ret)
    1581           0 :                 goto out;
    1582             : 
    1583             :         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
    1584           0 :         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
    1585           0 :         if (IS_ERR(inode)) {
    1586             :                 ext4_debug("Inode not found.");
    1587             :                 return -EFSCORRUPTED;
    1588             :         }
    1589             : 
    1590             :         /*
    1591             :          * Our allocator could have made different decisions than before
    1592             :          * crashing. This should be fixed but until then, we calculate
    1593             :          * the number of blocks the inode.
    1594             :          */
    1595           0 :         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
    1596           0 :                 ext4_ext_replay_set_iblocks(inode);
    1597             : 
    1598           0 :         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
    1599           0 :         ext4_reset_inode_seed(inode);
    1600             : 
    1601           0 :         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
    1602           0 :         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
    1603           0 :         sync_dirty_buffer(iloc.bh);
    1604           0 :         brelse(iloc.bh);
    1605           0 : out:
    1606           0 :         iput(inode);
    1607           0 :         if (!ret)
    1608           0 :                 blkdev_issue_flush(sb->s_bdev);
    1609             : 
    1610             :         return 0;
    1611             : }
    1612             : 
    1613             : /*
    1614             :  * Dentry create replay function (handles EXT4_FC_TAG_CREAT).
    1615             :  *
    1616             :  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
    1617             :  * inode for which we are trying to create a dentry here should already have
    1618             :  * been replayed before we start here. Returns 0 or a negative error code.
    1619             :  */
    1620           0 : static int ext4_fc_replay_create(struct super_block *sb,
    1621             :                                  struct ext4_fc_tl_mem *tl, u8 *val)
    1622             : {
    1623           0 :         int ret = 0;
    1624           0 :         struct inode *inode = NULL;
    1625           0 :         struct inode *dir = NULL;
    1626           0 :         struct dentry_info_args darg;
    1627             : 
    1628           0 :         tl_to_darg(&darg, tl, val); /* fills darg; ino, parent_ino and dname_len are read below */
    1629             : 
    1630           0 :         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
    1631             :                         darg.parent_ino, darg.dname_len);
    1632             : 
    1633             :         /* This takes care of updating the group descriptor and other metadata */
    1634           0 :         ret = ext4_mark_inode_used(sb, darg.ino);
    1635           0 :         if (ret)
    1636           0 :                 goto out; /* inode is still NULL here; the error in ret is returned */
    1637             : 
    1638           0 :         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
    1639           0 :         if (IS_ERR(inode)) {
    1640           0 :                 ext4_debug("inode %d not found.", darg.ino);
    1641           0 :                 inode = NULL; /* so that iput() at out: is not handed an error pointer */
    1642           0 :                 ret = -EINVAL;
    1643           0 :                 goto out;
    1644             :         }
    1645             : 
    1646           0 :         if (S_ISDIR(inode->i_mode)) {
    1647             :                 /*
    1648             :                  * If we are creating a directory, we need to make sure that the
    1649             :                  * dot and dot dot dirents are set up properly.
    1650             :                  */
    1651           0 :                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
    1652           0 :                 if (IS_ERR(dir)) {
    1653           0 :                         ext4_debug("Dir %d not found.", darg.ino); /* NOTE(review): prints darg.ino, but the failed lookup was for darg.parent_ino */
    1654           0 :                         goto out; /* NOTE(review): ret is still 0 here, so a missing parent is returned as success - confirm intent */
    1655             :                 }
    1656           0 :                 ret = ext4_init_new_dir(NULL, dir, inode);
    1657           0 :                 iput(dir); /* parent reference is only needed for the call above */
    1658           0 :                 if (ret) {
    1659           0 :                         ret = 0; /* failure is converted to success and the link step below is skipped */
    1660           0 :                         goto out;
    1661             :                 }
    1662             :         }
    1663           0 :         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
    1664           0 :         if (ret)
    1665           0 :                 goto out;
    1666           0 :         set_nlink(inode, 1); /* link count is forced to 1 after a successful link */
    1667           0 :         ext4_mark_inode_dirty(NULL, inode); /* return value is not checked */
    1668           0 : out:
    1669           0 :         iput(inode); /* inode is either NULL or a valid reference on every path reaching here */
    1670           0 :         return ret;
    1671             : }
    1672             : 
    1673             : /*
    1674             :  * Record physical disk regions which are in use as per fast commit area,
    1675             :  * and used by inodes during replay phase. Our simple replay phase
    1676             :  * allocator excludes these regions from allocation. Returns 0 or -ENOMEM.
    1677             :  */
    1678           0 : int ext4_fc_record_regions(struct super_block *sb, int ino,
    1679             :                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
    1680             : {
    1681           0 :         struct ext4_fc_replay_state *state;
    1682           0 :         struct ext4_fc_alloc_region *region;
    1683             : 
    1684           0 :         state = &EXT4_SB(sb)->s_fc_replay_state;
    1685             :         /*
    1686             :          * During the replay phase, fc_regions_valid may not be the same as
    1687             :          * fc_regions_used; resync it before making new additions.
    1688             :          */
    1689           0 :         if (replay && state->fc_regions_used != state->fc_regions_valid)
    1690           0 :                 state->fc_regions_used = state->fc_regions_valid; /* new entry is appended right after the valid ones */
    1691           0 :         if (state->fc_regions_used == state->fc_regions_size) { /* array is full: grow it */
    1692           0 :                 struct ext4_fc_alloc_region *fc_regions;
    1693             : 
    1694           0 :                 fc_regions = krealloc(state->fc_regions,
    1695             :                                       sizeof(struct ext4_fc_alloc_region) *
    1696           0 :                                       (state->fc_regions_size +
    1697             :                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
    1698             :                                       GFP_KERNEL);
    1699           0 :                 if (!fc_regions)
    1700             :                         return -ENOMEM; /* state->fc_regions and fc_regions_size are left unchanged */
    1701           0 :                 state->fc_regions_size +=
    1702             :                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
    1703           0 :                 state->fc_regions = fc_regions;
    1704             :         }
    1705           0 :         region = &state->fc_regions[state->fc_regions_used++]; /* claim the next free slot */
    1706           0 :         region->ino = ino;
    1707           0 :         region->lblk = lblk;
    1708           0 :         region->pblk = pblk;
    1709           0 :         region->len = len;
    1710             : 
    1711           0 :         if (replay)
    1712           0 :                 state->fc_regions_valid++; /* entries added with replay set count as valid immediately */
    1713             : 
    1714             :         return 0;
    1715             : }
    1716             : 
    1717             : /* Replay add range tag: make the inode map the logged extent, inserting or updating extents as needed. Every path returns 0. */
    1718           0 : static int ext4_fc_replay_add_range(struct super_block *sb,
    1719             :                                     struct ext4_fc_tl_mem *tl, u8 *val)
    1720             : {
    1721           0 :         struct ext4_fc_add_range fc_add_ex;
    1722           0 :         struct ext4_extent newex, *ex;
    1723           0 :         struct inode *inode;
    1724           0 :         ext4_lblk_t start, cur;
    1725           0 :         int remaining, len;
    1726           0 :         ext4_fsblk_t start_pblk;
    1727           0 :         struct ext4_map_blocks map;
    1728           0 :         struct ext4_ext_path *path = NULL;
    1729           0 :         int ret;
    1730             : 
    1731           0 :         memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); /* work on a local copy of the tag payload */
    1732           0 :         ex = (struct ext4_extent *)&fc_add_ex.fc_ex; /* view the embedded fc_ex field as an on-disk extent */
    1733             : 
    1734           0 :         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
    1735             :                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
    1736             :                 ext4_ext_get_actual_len(ex));
    1737             : 
    1738           0 :         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
    1739           0 :         if (IS_ERR(inode)) {
    1740             :                 ext4_debug("Inode not found.");
    1741             :                 return 0; /* a missing inode is not reported as an error */
    1742             :         }
    1743             : 
    1744           0 :         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
    1745           0 :         if (ret)
    1746           0 :                 goto out;
    1747             : 
    1748           0 :         start = le32_to_cpu(ex->ee_block);
    1749           0 :         start_pblk = ext4_ext_pblock(ex);
    1750           0 :         len = ext4_ext_get_actual_len(ex);
    1751             : 
    1752             :         cur = start; /* next logical block still to be processed */
    1753             :         remaining = len; /* number of blocks of the logged extent still to be processed */
    1754             :         ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
    1755             :                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
    1756             :                   inode->i_ino);
    1757             : 
    1758           0 :         while (remaining > 0) {
    1759           0 :                 map.m_lblk = cur;
    1760           0 :                 map.m_len = remaining;
    1761           0 :                 map.m_pblk = 0;
    1762           0 :                 ret = ext4_map_blocks(NULL, inode, &map, 0); /* lookup only: flags are 0 */
    1763             : 
    1764           0 :                 if (ret < 0)
    1765           0 :                         goto out;
    1766             : 
    1767           0 :                 if (ret == 0) {
    1768             :                         /* Range is not mapped */
    1769           0 :                         path = ext4_find_extent(inode, cur, NULL, 0);
    1770           0 :                         if (IS_ERR(path))
    1771           0 :                                 goto out;
    1772           0 :                         memset(&newex, 0, sizeof(newex));
    1773           0 :                         newex.ee_block = cpu_to_le32(cur);
    1774           0 :                         ext4_ext_store_pblock(
    1775           0 :                                 &newex, start_pblk + cur - start); /* physical block at the same offset into the logged extent */
    1776           0 :                         newex.ee_len = cpu_to_le16(map.m_len); /* m_len as left by ext4_map_blocks() above */
    1777           0 :                         if (ext4_ext_is_unwritten(ex))
    1778           0 :                                 ext4_ext_mark_unwritten(&newex); /* carry the unwritten state over from the logged extent */
    1779           0 :                         down_write(&EXT4_I(inode)->i_data_sem);
    1780           0 :                         ret = ext4_ext_insert_extent(
    1781             :                                 NULL, inode, &path, &newex, 0);
    1782           0 :                         up_write((&EXT4_I(inode)->i_data_sem));
    1783           0 :                         ext4_free_ext_path(path);
    1784           0 :                         if (ret)
    1785           0 :                                 goto out;
    1786           0 :                         goto next;
    1787             :                 }
    1788             : 
    1789           0 :                 if (start_pblk + cur - start != map.m_pblk) {
    1790             :                         /*
    1791             :                          * Logical to physical mapping changed. This can happen
    1792             :                          * if this range was removed and then reallocated to
    1793             :                          * map to new physical blocks during a fast commit.
    1794             :                          */
    1795           0 :                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
    1796             :                                         ext4_ext_is_unwritten(ex),
    1797             :                                         start_pblk + cur - start);
    1798           0 :                         if (ret)
    1799           0 :                                 goto out;
    1800             :                         /*
    1801             :                          * Mark the old blocks as free since they aren't used
    1802             :                          * anymore. We maintain an array of all the modified
    1803             :                          * inodes. In case these blocks are still used at either
    1804             :                          * a different logical range in the same inode or in
    1805             :                          * some different inode, we will mark them as allocated
    1806             :                          * at the end of the FC replay using our array of
    1807             :                          * modified inodes.
    1808             :                          */
    1809           0 :                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
    1810           0 :                         goto next;
    1811             :                 }
    1812             : 
    1813             :                 /* Range is mapped and needs a state change */
    1814           0 :                 ext4_debug("Converting from %ld to %d %lld",
    1815             :                                 map.m_flags & EXT4_MAP_UNWRITTEN,
    1816             :                         ext4_ext_is_unwritten(ex), map.m_pblk);
    1817           0 :                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
    1818             :                                         ext4_ext_is_unwritten(ex), map.m_pblk); /* same physical block, only the unwritten state is passed anew */
    1819           0 :                 if (ret)
    1820           0 :                         goto out;
    1821             :                 /*
    1822             :                  * We may have split the extent tree while toggling the state.
    1823             :                  * Try to shrink the extent tree now.
    1824             :                  */
    1825           0 :                 ext4_ext_replay_shrink_inode(inode, start + len);
    1826           0 : next:
    1827           0 :                 cur += map.m_len; /* advance by the length ext4_map_blocks() reported in map */
    1828           0 :                 remaining -= map.m_len;
    1829             :         }
    1830           0 :         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
    1831           0 :                                         sb->s_blocksize_bits); /* second argument is i_size converted to blocks */
    1832           0 : out:
    1833           0 :         iput(inode);
    1834           0 :         return 0; /* NOTE(review): ret is discarded, so failures above are not propagated - confirm this is intended best-effort */
    1835             : }
    1836             : 
    1837             : /* Replay DEL_RANGE tag: release the blocks mapped in the logged range and remove the range from the inode. Every path returns 0. */
    1838             : static int
    1839           0 : ext4_fc_replay_del_range(struct super_block *sb,
    1840             :                          struct ext4_fc_tl_mem *tl, u8 *val)
    1841             : {
    1842           0 :         struct inode *inode;
    1843           0 :         struct ext4_fc_del_range lrange;
    1844           0 :         struct ext4_map_blocks map;
    1845           0 :         ext4_lblk_t cur, remaining;
    1846           0 :         int ret;
    1847             : 
    1848           0 :         memcpy(&lrange, val, sizeof(lrange)); /* work on a local copy of the tag payload */
    1849           0 :         cur = le32_to_cpu(lrange.fc_lblk); /* first logical block of the range */
    1850           0 :         remaining = le32_to_cpu(lrange.fc_len); /* number of blocks in the range */
    1851             : 
    1852           0 :         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
    1853           0 :                 le32_to_cpu(lrange.fc_ino), cur, remaining);
    1854             : 
    1855           0 :         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
    1856           0 :         if (IS_ERR(inode)) {
    1857             :                 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
    1858             :                 return 0; /* a missing inode is not reported as an error */
    1859             :         }
    1860             : 
    1861           0 :         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
    1862           0 :         if (ret)
    1863           0 :                 goto out;
    1864             : 
    1865             :         ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
    1866             :                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
    1867             :                         le32_to_cpu(lrange.fc_len));
    1868           0 :         while (remaining > 0) { /* first pass: walk the range and release every mapped piece */
    1869           0 :                 map.m_lblk = cur;
    1870           0 :                 map.m_len = remaining;
    1871             : 
    1872           0 :                 ret = ext4_map_blocks(NULL, inode, &map, 0); /* lookup only: flags are 0 */
    1873           0 :                 if (ret < 0)
    1874           0 :                         goto out;
    1875           0 :                 if (ret > 0) {
    1876           0 :                         remaining -= ret; /* mapped: advance by the returned count */
    1877           0 :                         cur += ret;
    1878           0 :                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); /* last argument 0: presumably marks the blocks free - confirm against ext4_mb_mark_bb() */
    1879             :                 } else {
    1880           0 :                         remaining -= map.m_len; /* not mapped: skip by the length left in map */
    1881           0 :                         cur += map.m_len;
    1882             :                 }
    1883             :         }
    1884             : 
    1885           0 :         down_write(&EXT4_I(inode)->i_data_sem);
    1886           0 :         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
    1887           0 :                                 le32_to_cpu(lrange.fc_lblk) +
    1888             :                                 le32_to_cpu(lrange.fc_len) - 1); /* end argument is the last block of the range (start + len - 1) */
    1889           0 :         up_write(&EXT4_I(inode)->i_data_sem);
    1890           0 :         if (ret)
    1891           0 :                 goto out;
    1892           0 :         ext4_ext_replay_shrink_inode(inode,
    1893           0 :                 i_size_read(inode) >> sb->s_blocksize_bits); /* second argument is i_size converted to blocks */
    1894           0 :         ext4_mark_inode_dirty(NULL, inode); /* return value is not checked */
    1895           0 : out:
    1896           0 :         iput(inode);
    1897           0 :         return 0; /* NOTE(review): ret is discarded, so failures above are not propagated - confirm this is intended best-effort */
    1898             : }
    1899             : 
    1900           0 : static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
    1901             : { /* For each inode in fc_modified_inodes, walk its mapped ranges and call ext4_mb_mark_bb(..., 1) on the data blocks and on the blocks of the extent path. */
    1902           0 :         struct ext4_fc_replay_state *state;
    1903           0 :         struct inode *inode;
    1904           0 :         struct ext4_ext_path *path = NULL;
    1905           0 :         struct ext4_map_blocks map;
    1906           0 :         int i, ret, j;
    1907           0 :         ext4_lblk_t cur, end;
    1908             : 
    1909           0 :         state = &EXT4_SB(sb)->s_fc_replay_state;
    1910           0 :         for (i = 0; i < state->fc_modified_inodes_used; i++) {
    1911           0 :                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
    1912             :                         EXT4_IGET_NORMAL);
    1913           0 :                 if (IS_ERR(inode)) {
    1914           0 :                         ext4_debug("Inode %d not found.",
    1915             :                                 state->fc_modified_inodes[i]);
    1916           0 :                         continue; /* inodes that cannot be loaded are skipped */
    1917             :                 }
    1918           0 :                 cur = 0;
    1919           0 :                 end = EXT_MAX_BLOCKS; /* scan the whole logical block range [0, EXT_MAX_BLOCKS) */
    1920           0 :                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
    1921           0 :                         iput(inode); /* inline-data inodes are skipped without walking any mappings */
    1922           0 :                         continue;
    1923             :                 }
    1924           0 :                 while (cur < end) {
    1925           0 :                         map.m_lblk = cur;
    1926           0 :                         map.m_len = end - cur;
    1927             : 
    1928           0 :                         ret = ext4_map_blocks(NULL, inode, &map, 0); /* lookup only: flags are 0 */
    1929           0 :                         if (ret < 0)
    1930             :                                 break; /* stop walking this inode on a mapping error; the error is not reported */
    1931             : 
    1932           0 :                         if (ret > 0) {
    1933           0 :                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
    1934           0 :                                 if (!IS_ERR(path)) {
    1935           0 :                                         for (j = 0; j < path->p_depth; j++) /* one call per path level below p_depth */
    1936           0 :                                                 ext4_mb_mark_bb(inode->i_sb,
    1937           0 :                                                         path[j].p_block, 1, 1);
    1938           0 :                                         ext4_free_ext_path(path);
    1939             :                                 }
    1940           0 :                                 cur += ret; /* advance past the mapped run */
    1941           0 :                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
    1942           0 :                                                         map.m_len, 1); /* mark the data blocks of this run */
    1943             :                         } else {
    1944           0 :                                 cur = cur + (map.m_len ? map.m_len : 1); /* unmapped: skip ahead by at least one block so the loop always advances */
    1945             :                         }
    1946             :                 }
    1947           0 :                 iput(inode);
    1948             :         }
    1949           0 : }
    1950             : 
    1951             : /*
    1952             :  * Check if block is in excluded regions for block allocation. The simple
    1953             :  * allocator that runs during replay phase calls this function to see
    1954             :  * if it is okay to use a block. Returns true if blk must not be used.
    1955             :  */
    1956           0 : bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
    1957             : {
    1958           0 :         int i;
    1959           0 :         struct ext4_fc_replay_state *state;
    1960             : 
    1961           0 :         state = &EXT4_SB(sb)->s_fc_replay_state;
    1962           0 :         for (i = 0; i < state->fc_regions_valid; i++) { /* only the first fc_regions_valid entries are consulted */
    1963           0 :                 if (state->fc_regions[i].ino == 0 ||
    1964           0 :                         state->fc_regions[i].len == 0)
    1965           0 :                         continue; /* entries with a zero inode number or zero length never exclude anything */
    1966           0 :                 if (in_range(blk, state->fc_regions[i].pblk,
    1967             :                                         state->fc_regions[i].len))
    1968             :                         return true; /* blk lies inside a recorded region */
    1969             :         }
    1970             :         return false;
    1971             : }
    1972             : 
    1973             : /* Cleanup function called after replay: clears EXT4_FC_REPLAY from s_mount_state and frees the replay-state arrays. */
    1974        2536 : void ext4_fc_replay_cleanup(struct super_block *sb)
    1975             : {
    1976        2536 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1977             : 
    1978        2536 :         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
    1979        2536 :         kfree(sbi->s_fc_replay_state.fc_regions); /* the pointer itself is not reset to NULL here */
    1980        2536 :         kfree(sbi->s_fc_replay_state.fc_modified_inodes); /* likewise left pointing at freed memory */
    1981        2536 : }
    1982             : 
    1983           0 : static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
    1984             :                                       int tag, int len)
    1985             : { /* Returns true if len is an acceptable value length for the given fast commit tag; unknown tags yield false. */
    1986           0 :         switch (tag) {
    1987           0 :         case EXT4_FC_TAG_ADD_RANGE:
    1988           0 :                 return len == sizeof(struct ext4_fc_add_range); /* fixed-size payload */
    1989           0 :         case EXT4_FC_TAG_DEL_RANGE:
    1990           0 :                 return len == sizeof(struct ext4_fc_del_range); /* fixed-size payload */
    1991           0 :         case EXT4_FC_TAG_CREAT:
    1992             :         case EXT4_FC_TAG_LINK:
    1993             :         case EXT4_FC_TAG_UNLINK:
    1994           0 :                 len -= sizeof(struct ext4_fc_dentry_info); /* what remains after the fixed struct is bounded below */
    1995           0 :                 return len >= 1 && len <= EXT4_NAME_LEN; /* remainder must be 1..EXT4_NAME_LEN bytes */
    1996           0 :         case EXT4_FC_TAG_INODE:
    1997           0 :                 len -= sizeof(struct ext4_fc_inode); /* what remains after the fixed struct is bounded below */
    1998           0 :                 return len >= EXT4_GOOD_OLD_INODE_SIZE &&
    1999           0 :                         len <= sbi->s_inode_size; /* remainder must lie between the old inode size and this filesystem's inode size */
    2000             :         case EXT4_FC_TAG_PAD:
    2001             :                 return true; /* padding can have any length */
    2002           0 :         case EXT4_FC_TAG_TAIL:
    2003           0 :                 return len >= sizeof(struct ext4_fc_tail); /* at least a full tail struct; longer values are accepted */
    2004           0 :         case EXT4_FC_TAG_HEAD:
    2005           0 :                 return len == sizeof(struct ext4_fc_head); /* fixed-size payload */
    2006             :         }
    2007           0 :         return false; /* tag not handled above */
    2008             : }
    2009             : 
    2010             : /*
    2011             :  * Recovery Scan phase handler
    2012             :  *
    2013             :  * This function is called during the scan phase and is responsible
    2014             :  * for doing the following things:
    2015             :  * - Make sure the fast commit area has valid tags for replay
    2016             :  * - Count number of tags that need to be replayed by the replay handler
    2017             :  * - Verify CRC
    2018             :  * - Create a list of excluded blocks for allocation during replay phase
    2019             :  *
    2020             :  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
    2021             :  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
    2022             :  * to indicate that scan has finished and JBD2 can now start replay phase.
    2023             :  * It returns a negative error to indicate that there was an error. At the end
    2024             :  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
    2025             :  * to indicate the number of tags that need to be replayed during the replay phase.
    2026             :  */
    2027           0 : static int ext4_fc_replay_scan(journal_t *journal,
    2028             :                                 struct buffer_head *bh, int off,
    2029             :                                 tid_t expected_tid)
    2030             : {
    2031           0 :         struct super_block *sb = journal->j_private;
    2032           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2033           0 :         struct ext4_fc_replay_state *state;
    2034           0 :         int ret = JBD2_FC_REPLAY_CONTINUE;
    2035           0 :         struct ext4_fc_add_range ext;
    2036           0 :         struct ext4_fc_tl_mem tl;
    2037           0 :         struct ext4_fc_tail tail;
    2038           0 :         __u8 *start, *end, *cur, *val;
    2039           0 :         struct ext4_fc_head head;
    2040           0 :         struct ext4_extent *ex;
    2041             : 
    2042           0 :         state = &sbi->s_fc_replay_state;
    2043             : 
    2044           0 :         start = (u8 *)bh->b_data;
    2045           0 :         end = start + journal->j_blocksize; /* one past the last byte of this block */
    2046             : 
    2047           0 :         if (state->fc_replay_expected_off == 0) { /* first block of the scan: reset the scan state */
    2048           0 :                 state->fc_cur_tag = 0;
    2049           0 :                 state->fc_replay_num_tags = 0;
    2050           0 :                 state->fc_crc = 0;
    2051           0 :                 state->fc_regions = NULL;
    2052           0 :                 state->fc_regions_valid = state->fc_regions_used =
    2053           0 :                         state->fc_regions_size = 0;
    2054             :                 /* Check if we can stop early */
    2055           0 :                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
    2056             :                         != EXT4_FC_TAG_HEAD)
    2057             :                         return 0; /* bypasses the trace at out_err; 0 is presumably JBD2_FC_REPLAY_STOP - confirm */
    2058             :         }
    2059             : 
    2060           0 :         if (off != state->fc_replay_expected_off) { /* blocks must be presented in sequence */
    2061           0 :                 ret = -EFSCORRUPTED;
    2062           0 :                 goto out_err;
    2063             :         }
    2064             : 
    2065           0 :         state->fc_replay_expected_off++;
    2066           0 :         for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; /* stop when no complete tag header fits any more */
    2067           0 :              cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { /* step over the tag header plus its value */
    2068           0 :                 ext4_fc_get_tl(&tl, cur);
    2069           0 :                 val = cur + EXT4_FC_TAG_BASE_LEN; /* value bytes follow the tag header */
    2070           0 :                 if (tl.fc_len > end - val || /* value would run past the end of the block */
    2071           0 :                     !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
    2072           0 :                         ret = state->fc_replay_num_tags ? /* stop cleanly if a complete commit was already seen, else fail */
    2073           0 :                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
    2074           0 :                         goto out_err;
    2075             :                 }
    2076           0 :                 ext4_debug("Scan phase, tag:%s, blk %lld\n",
    2077             :                            tag2str(tl.fc_tag), bh->b_blocknr);
    2078           0 :                 switch (tl.fc_tag) {
    2079           0 :                 case EXT4_FC_TAG_ADD_RANGE:
    2080           0 :                         memcpy(&ext, val, sizeof(ext));
    2081           0 :                         ex = (struct ext4_extent *)&ext.fc_ex;
    2082           0 :                         ret = ext4_fc_record_regions(sb, /* remember the extent so the replay allocator avoids it */
    2083             :                                 le32_to_cpu(ext.fc_ino),
    2084             :                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
    2085             :                                 ext4_ext_get_actual_len(ex), 0);
    2086           0 :                         if (ret < 0)
    2087             :                                 break; /* leaves the switch; the ret < 0 check after it ends the loop */
    2088             :                         ret = JBD2_FC_REPLAY_CONTINUE; /* restore the loop status overwritten by the call above */
    2089           0 :                         fallthrough;
    2090           0 :                 case EXT4_FC_TAG_DEL_RANGE:
    2091             :                 case EXT4_FC_TAG_LINK:
    2092             :                 case EXT4_FC_TAG_UNLINK:
    2093             :                 case EXT4_FC_TAG_CREAT:
    2094             :                 case EXT4_FC_TAG_INODE:
    2095             :                 case EXT4_FC_TAG_PAD:
    2096           0 :                         state->fc_cur_tag++;
    2097           0 :                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, /* fold the whole tag (header + value) into the running CRC */
    2098           0 :                                 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
    2099           0 :                         break;
    2100           0 :                 case EXT4_FC_TAG_TAIL:
    2101           0 :                         state->fc_cur_tag++;
    2102           0 :                         memcpy(&tail, val, sizeof(tail));
    2103           0 :                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, /* covers the header and the tail up to, but not including, fc_crc */
    2104             :                                                 EXT4_FC_TAG_BASE_LEN +
    2105             :                                                 offsetof(struct ext4_fc_tail,
    2106             :                                                 fc_crc));
    2107           0 :                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
    2108             :                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
    2109           0 :                                 state->fc_replay_num_tags = state->fc_cur_tag; /* everything up to this tail is replayable */
    2110           0 :                                 state->fc_regions_valid =
    2111           0 :                                         state->fc_regions_used; /* regions recorded so far become valid */
    2112             :                         } else {
    2113           0 :                                 ret = state->fc_replay_num_tags ? /* keep earlier complete commits, otherwise report a bad CRC */
    2114           0 :                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
    2115             :                         }
    2116           0 :                         state->fc_crc = 0; /* running CRC restarts after every tail */
    2117           0 :                         break;
    2118           0 :                 case EXT4_FC_TAG_HEAD:
    2119           0 :                         memcpy(&head, val, sizeof(head));
    2120           0 :                         if (le32_to_cpu(head.fc_features) &
    2121             :                                 ~EXT4_FC_SUPPORTED_FEATURES) { /* any feature bit outside the supported set */
    2122             :                                 ret = -EOPNOTSUPP;
    2123             :                                 break;
    2124             :                         }
    2125           0 :                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
    2126             :                                 ret = JBD2_FC_REPLAY_STOP; /* head carries a different transaction id: stop scanning */
    2127             :                                 break;
    2128             :                         }
    2129           0 :                         state->fc_cur_tag++;
    2130           0 :                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
    2131           0 :                                 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
    2132           0 :                         break;
    2133           0 :                 default:
    2134           0 :                         ret = state->fc_replay_num_tags ? /* unknown tag: same policy as an invalid length */
    2135           0 :                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
    2136             :                 }
    2137           0 :                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
    2138             :                         break; /* any error or stop request ends the scan of this block */
    2139             :         }
    2140             : 
    2141           0 : out_err:
    2142           0 :         trace_ext4_fc_replay_scan(sb, ret, off); /* reached on both success and failure despite the label name */
    2143           0 :         return ret;
    2144             : }
    2145             : 
    2146             : /*
    2147             :  * Main recovery path entry point.
    2148             :  * The meaning of return codes is similar to the above.
                        *
                        * PASS_SCAN is delegated to ext4_fc_replay_scan(). For any other pass the
                        * tags found in @bh are dispatched one by one to the ext4_fc_replay_*()
                        * handlers until state->fc_replay_num_tags reaches zero, a handler
                        * returns a negative error, or the end of the block is reached.
                        *
                        * Returns a negative error from a handler (or -ECANCELED for an unknown
                        * tag), JBD2_FC_REPLAY_STOP, JBD2_FC_REPLAY_CONTINUE, or a literal 0 when
                        * there were no tags left on entry.
    2149             :  */
    2150           0 : static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
    2151             :                                 enum passtype pass, int off, tid_t expected_tid)
    2152             : {
    2153           0 :         struct super_block *sb = journal->j_private;
    2154           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2155           0 :         struct ext4_fc_tl_mem tl;
    2156           0 :         __u8 *start, *end, *cur, *val;
    2157           0 :         int ret = JBD2_FC_REPLAY_CONTINUE;
    2158           0 :         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
    2159           0 :         struct ext4_fc_tail tail;
    2160             : 
                               /* The scan pass is handled entirely by ext4_fc_replay_scan(). */
    2161           0 :         if (pass == PASS_SCAN) {
    2162           0 :                 state->fc_current_pass = PASS_SCAN;
    2163           0 :                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
    2164             :         }
    2165             : 
                               /*
                                * First call for a new pass: remember the pass and set
                                * EXT4_FC_REPLAY in s_mount_state.
                                */
    2166           0 :         if (state->fc_current_pass != pass) {
    2167           0 :                 state->fc_current_pass = pass;
    2168           0 :                 sbi->s_mount_state |= EXT4_FC_REPLAY;
    2169             :         }
                               /*
                                * Same counter as state->fc_replay_num_tags (state points at
                                * sbi->s_fc_replay_state). With nothing left to replay, call
                                * ext4_fc_set_bitmaps_and_counters() and return 0.
                                */
    2170           0 :         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
    2171           0 :                 ext4_debug("Replay stops\n");
    2172           0 :                 ext4_fc_set_bitmaps_and_counters(sb);
    2173           0 :                 return 0;
    2174             :         }
    2175             : 
    2176             : #ifdef CONFIG_EXT4_DEBUG
                               /*
                                * Debug-only limit: when s_fc_debug_max_replay is non-zero, stop
                                * at the first block whose offset is at or beyond it.
                                */
    2177           0 :         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
    2178           0 :                 pr_warn("Dropping fc block %d because max_replay set\n", off);
    2179           0 :                 return JBD2_FC_REPLAY_STOP;
    2180             :         }
    2181             : #endif
    2182             : 
    2183           0 :         start = (u8 *)bh->b_data;
    2184           0 :         end = start + journal->j_blocksize;
    2185             : 
                               /*
                                * Walk the TLVs in this block. Each step advances by the tag
                                * header (EXT4_FC_TAG_BASE_LEN) plus the value length tl.fc_len.
                                * NOTE(review): tl.fc_len is not range-checked against 'end' in
                                * this loop; presumably the scan pass has already validated it -
                                * confirm against ext4_fc_replay_scan().
                                */
    2186           0 :         for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
    2187           0 :              cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
    2188           0 :                 ext4_fc_get_tl(&tl, cur);
    2189           0 :                 val = cur + EXT4_FC_TAG_BASE_LEN;
    2190             : 
                                       /* Tag budget exhausted in the middle of a block: stop. */
    2191           0 :                 if (state->fc_replay_num_tags == 0) {
    2192           0 :                         ret = JBD2_FC_REPLAY_STOP;
    2193           0 :                         ext4_fc_set_bitmaps_and_counters(sb);
    2194           0 :                         break;
    2195             :                 }
    2196             : 
    2197           0 :                 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
                                       /* Every tag visited, including PAD/TAIL/HEAD, consumes one. */
    2198           0 :                 state->fc_replay_num_tags--;
    2199           0 :                 switch (tl.fc_tag) {
    2200           0 :                 case EXT4_FC_TAG_LINK:
    2201           0 :                         ret = ext4_fc_replay_link(sb, &tl, val);
    2202           0 :                         break;
    2203           0 :                 case EXT4_FC_TAG_UNLINK:
    2204           0 :                         ret = ext4_fc_replay_unlink(sb, &tl, val);
    2205           0 :                         break;
    2206           0 :                 case EXT4_FC_TAG_ADD_RANGE:
    2207           0 :                         ret = ext4_fc_replay_add_range(sb, &tl, val);
    2208           0 :                         break;
    2209           0 :                 case EXT4_FC_TAG_CREAT:
    2210           0 :                         ret = ext4_fc_replay_create(sb, &tl, val);
    2211           0 :                         break;
    2212           0 :                 case EXT4_FC_TAG_DEL_RANGE:
    2213           0 :                         ret = ext4_fc_replay_del_range(sb, &tl, val);
    2214           0 :                         break;
    2215           0 :                 case EXT4_FC_TAG_INODE:
    2216           0 :                         ret = ext4_fc_replay_inode(sb, &tl, val);
    2217           0 :                         break;
    2218           0 :                 case EXT4_FC_TAG_PAD:
                                               /* Padding carries nothing to replay; only traced. */
    2219           0 :                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
    2220           0 :                                              tl.fc_len, 0);
    2221           0 :                         break;
    2222           0 :                 case EXT4_FC_TAG_TAIL:
    2223           0 :                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
    2224           0 :                                              0, tl.fc_len, 0);
                                               /*
                                                * NOTE(review): a tid mismatch only triggers
                                                * WARN_ON(); ret is left untouched, so replay
                                                * carries on. Confirm that is intended.
                                                */
    2225           0 :                         memcpy(&tail, val, sizeof(tail));
    2226           0 :                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
    2227             :                         break;
    2228             :                 case EXT4_FC_TAG_HEAD:
                                               /* Nothing is done for HEAD in this pass. */
    2229             :                         break;
    2230           0 :                 default:
                                               /* Unknown tag: trace it and abort with -ECANCELED. */
    2231           0 :                         trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
    2232           0 :                         ret = -ECANCELED;
    2233           0 :                         break;
    2234             :                 }
    2235           0 :                 if (ret < 0)
    2236             :                         break;
                                       /* Any non-negative handler result is normalized to CONTINUE. */
    2237           0 :                 ret = JBD2_FC_REPLAY_CONTINUE;
    2238             :         }
    2239             :         return ret;
    2240             : }
    2241             : 
    2242        3139 : void ext4_fc_init(struct super_block *sb, journal_t *journal)
    2243             : {
    2244             :         /*
    2245             :          * The replay callback is set even if fast commit is disabled, because
    2246             :          * there could still be fast commit blocks that need to be replayed
    2247             :          * after fast commit has been turned off.
    2248             :          */
    2249        3139 :         journal->j_fc_replay_callback = ext4_fc_replay;
                               /*
                                * The cleanup callback is installed only when the
                                * JOURNAL_FAST_COMMIT mount option is set on @sb.
                                */
    2250        3139 :         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
    2251             :                 return;
    2252           0 :         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
    2253             : }
    2254             : 
    2255             : static const char * const fc_ineligible_reasons[] = {
                               /*
                                * Human-readable labels indexed by EXT4_FC_REASON_* value;
                                * printed by ext4_fc_info_show() next to the per-reason counters.
                                */
    2256             :         [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
    2257             :         [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
    2258             :         [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
    2259             :         [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
    2260             :         [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
    2261             :         [EXT4_FC_REASON_RESIZE] = "Resize",
    2262             :         [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
    2263             :         [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
    2264             :         [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
    2265             :         [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
    2266             : };
    2267             : 
    2268           0 : int ext4_fc_info_show(struct seq_file *seq, void *v)
    2269             : {
                               /*
                                * Print the fast commit statistics kept in sbi->s_fc_stats to
                                * @seq, followed by one line per ineligibility reason. The
                                * super_block is taken from seq->private. Always returns 0.
                                */
    2270           0 :         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
    2271           0 :         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
    2272           0 :         int i;
    2273             : 
                               /* Output is produced only for the start token; otherwise no-op. */
    2274           0 :         if (v != SEQ_START_TOKEN)
    2275             :                 return 0;
    2276             : 
                               /*
                                * The average commit time is divided by 1000 and labelled "us";
                                * presumably s_fc_avg_commit_time is kept in nanoseconds - confirm.
                                */
    2277           0 :         seq_printf(seq,
    2278             :                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
    2279             :                    stats->fc_num_commits, stats->fc_ineligible_commits,
    2280             :                    stats->fc_numblks,
    2281             :                    div_u64(stats->s_fc_avg_commit_time, 1000));
    2282           0 :         seq_puts(seq, "Ineligible reasons:\n");
                               /*
                                * NOTE(review): assumes fc_ineligible_reasons[] has a label for
                                * every index below EXT4_FC_REASON_MAX - verify against the enum.
                                */
    2283           0 :         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
    2284           0 :                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
    2285           0 :                         stats->fc_ineligible_reason_count[i]);
    2286             : 
    2287             :         return 0;
    2288             : }
    2289             : 
    2290          12 : int __init ext4_fc_init_dentry_cache(void)
    2291             : {
                               /*
                                * Set up ext4_fc_dentry_cachep via KMEM_CACHE() for
                                * ext4_fc_dentry_update with SLAB_RECLAIM_ACCOUNT.
                                * Returns 0 on success, -ENOMEM if the cache pointer is NULL.
                                */
    2292          12 :         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
    2293             :                                            SLAB_RECLAIM_ACCOUNT);
    2294             : 
    2295          12 :         if (ext4_fc_dentry_cachep == NULL)
    2296           0 :                 return -ENOMEM;
    2297             : 
    2298             :         return 0;
    2299             : }
    2300             : 
    2301           0 : void ext4_fc_destroy_dentry_cache(void)
    2302             : {
                               /* Hand ext4_fc_dentry_cachep to kmem_cache_destroy(). */
    2303           0 :         kmem_cache_destroy(ext4_fc_dentry_cachep);
    2304           0 : }

Generated by: LCOV version 1.14