Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * linux/fs/ext4/inode.c
4 : *
5 : * Copyright (C) 1992, 1993, 1994, 1995
6 : * Remy Card (card@masi.ibp.fr)
7 : * Laboratoire MASI - Institut Blaise Pascal
8 : * Universite Pierre et Marie Curie (Paris VI)
9 : *
10 : * from
11 : *
12 : * linux/fs/minix/inode.c
13 : *
14 : * Copyright (C) 1991, 1992 Linus Torvalds
15 : *
16 : * 64-bit file support on 64-bit platforms by Jakub Jelinek
17 : * (jj@sunsite.ms.mff.cuni.cz)
18 : *
19 : * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
20 : */
21 :
22 : #include <linux/fs.h>
23 : #include <linux/mount.h>
24 : #include <linux/time.h>
25 : #include <linux/highuid.h>
26 : #include <linux/pagemap.h>
27 : #include <linux/dax.h>
28 : #include <linux/quotaops.h>
29 : #include <linux/string.h>
30 : #include <linux/buffer_head.h>
31 : #include <linux/writeback.h>
32 : #include <linux/pagevec.h>
33 : #include <linux/mpage.h>
34 : #include <linux/namei.h>
35 : #include <linux/uio.h>
36 : #include <linux/bio.h>
37 : #include <linux/workqueue.h>
38 : #include <linux/kernel.h>
39 : #include <linux/printk.h>
40 : #include <linux/slab.h>
41 : #include <linux/bitops.h>
42 : #include <linux/iomap.h>
43 : #include <linux/iversion.h>
44 :
45 : #include "ext4_jbd2.h"
46 : #include "xattr.h"
47 : #include "acl.h"
48 : #include "truncate.h"
49 :
50 : #include <trace/events/ext4.h>
51 :
52 78721762 : static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
53 : struct ext4_inode_info *ei)
54 : {
55 78721762 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
56 78721762 : __u32 csum;
57 78721762 : __u16 dummy_csum = 0;
58 78721762 : int offset = offsetof(struct ext4_inode, i_checksum_lo);
59 78721762 : unsigned int csum_size = sizeof(dummy_csum);
60 :
61 78721762 : csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
62 78849502 : csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
63 78753069 : offset += csum_size;
64 78753069 : csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
65 : EXT4_GOOD_OLD_INODE_SIZE - offset);
66 :
67 78734324 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
68 78735583 : offset = offsetof(struct ext4_inode, i_checksum_hi);
69 78735583 : csum = ext4_chksum(sbi, csum, (__u8 *)raw +
70 : EXT4_GOOD_OLD_INODE_SIZE,
71 : offset - EXT4_GOOD_OLD_INODE_SIZE);
72 78782923 : if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
73 78793958 : csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
74 : csum_size);
75 78793958 : offset += csum_size;
76 : }
77 78733318 : csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
78 78733318 : EXT4_INODE_SIZE(inode->i_sb) - offset);
79 : }
80 :
81 78866620 : return csum;
82 : }
83 :
84 242036 : static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
85 : struct ext4_inode_info *ei)
86 : {
87 242036 : __u32 provided, calculated;
88 :
89 242036 : if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
90 242036 : cpu_to_le32(EXT4_OS_LINUX) ||
91 242036 : !ext4_has_metadata_csum(inode->i_sb))
92 457 : return 1;
93 :
94 241579 : provided = le16_to_cpu(raw->i_checksum_lo);
95 241579 : calculated = ext4_inode_csum(inode, raw, ei);
96 241579 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
97 241575 : EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
98 241573 : provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
99 : else
100 6 : calculated &= 0xFFFF;
101 :
102 241579 : return provided == calculated;
103 : }
104 :
105 78552404 : void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
106 : struct ext4_inode_info *ei)
107 : {
108 78552404 : __u32 csum;
109 :
110 78552404 : if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
111 78472047 : cpu_to_le32(EXT4_OS_LINUX) ||
112 78574525 : !ext4_has_metadata_csum(inode->i_sb))
113 14486 : return;
114 :
115 78435440 : csum = ext4_inode_csum(inode, raw, ei);
116 78619446 : raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
117 78619446 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
118 78620262 : EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
119 78620262 : raw->i_checksum_hi = cpu_to_le16(csum >> 16);
120 : }
121 :
122 2056720 : static inline int ext4_begin_ordered_truncate(struct inode *inode,
123 : loff_t new_size)
124 : {
125 2056720 : trace_ext4_begin_ordered_truncate(inode, new_size);
126 : /*
127 : * If jinode is zero, then we never opened the file for
128 : * writing, so there's no need to call
129 : * jbd2_journal_begin_ordered_truncate() since there's no
130 : * outstanding writes we need to flush.
131 : */
132 2056556 : if (!EXT4_I(inode)->jinode)
133 : return 0;
134 1904431 : return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
135 1904431 : EXT4_I(inode)->jinode,
136 : new_size);
137 : }
138 :
139 : static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
140 : int pextents);
141 :
142 : /*
143 : * Test whether an inode is a fast symlink.
144 : * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
145 : */
146 2022283 : int ext4_inode_is_fast_symlink(struct inode *inode)
147 : {
148 2022283 : if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
149 2022273 : int ea_blocks = EXT4_I(inode)->i_file_acl ?
150 15002 : EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
151 :
152 2022273 : if (ext4_has_inline_data(inode))
153 : return 0;
154 :
155 2022273 : return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
156 : }
157 10 : return S_ISLNK(inode->i_mode) && inode->i_size &&
158 : (inode->i_size < EXT4_N_BLOCKS * 4);
159 : }
160 :
161 : /*
162 : * Called at the last iput() if i_nlink is zero.
163 : */
164 3248607 : void ext4_evict_inode(struct inode *inode)
165 : {
166 3248607 : handle_t *handle;
167 3248607 : int err;
168 : /*
169 : * Credits for final inode cleanup and freeing:
170 : * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
171 : * (xattr block freeing), bitmap, group descriptor (inode freeing)
172 : */
173 3248607 : int extra_credits = 6;
174 3248607 : struct ext4_xattr_inode_array *ea_inode_array = NULL;
175 3248607 : bool freeze_protected = false;
176 :
177 3248607 : trace_ext4_evict_inode(inode);
178 :
179 3248361 : if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
180 10 : ext4_evict_ea_inode(inode);
181 3248361 : if (inode->i_nlink) {
182 1192704 : truncate_inode_pages_final(&inode->i_data);
183 :
184 1192704 : goto no_delete;
185 : }
186 :
187 2055657 : if (is_bad_inode(inode))
188 1035 : goto no_delete;
189 2054617 : dquot_initialize(inode);
190 :
191 2054716 : if (ext4_should_order_data(inode))
192 1763884 : ext4_begin_ordered_truncate(inode, 0);
193 2054406 : truncate_inode_pages_final(&inode->i_data);
194 :
195 : /*
196 : * For inodes with journalled data, transaction commit could have
197 : * dirtied the inode. And for inodes with dioread_nolock, unwritten
198 : * extents converting worker could merge extents and also have dirtied
199 : * the inode. Flush worker is ignoring it because of I_FREEING flag but
200 : * we still need to remove the inode from the writeback lists.
201 : */
202 2054384 : if (!list_empty_careful(&inode->i_io_list))
203 86 : inode_io_list_del(inode);
204 :
205 : /*
206 : * Protect us against freezing - iput() caller didn't have to have any
207 : * protection against it. When we are in a running transaction though,
208 : * we are already protected against freezing and we cannot grab further
209 : * protection due to lock ordering constraints.
210 : */
211 2054054 : if (!ext4_journal_current_handle()) {
212 2052348 : sb_start_intwrite(inode->i_sb);
213 2052348 : freeze_protected = true;
214 : }
215 :
216 2054357 : if (!IS_NOQUOTA(inode))
217 2084881 : extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
218 :
219 : /*
220 : * Block bitmap, group descriptor, and inode are accounted in both
221 : * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
222 : */
223 2054357 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
224 : ext4_blocks_for_truncate(inode) + extra_credits - 3);
225 2049490 : if (IS_ERR(handle)) {
226 50595 : ext4_std_error(inode->i_sb, PTR_ERR(handle));
227 : /*
228 : * If we're going to skip the normal cleanup, we still need to
229 : * make sure that the in-core orphan linked list is properly
230 : * cleaned up.
231 : */
232 50595 : ext4_orphan_del(NULL, inode);
233 50595 : if (freeze_protected)
234 50595 : sb_end_intwrite(inode->i_sb);
235 50595 : goto no_delete;
236 : }
237 :
238 1998895 : if (IS_SYNC(inode))
239 3 : ext4_handle_sync(handle);
240 :
241 : /*
242 : * Set inode->i_size to 0 before calling ext4_truncate(). We need
243 : * special handling of symlinks here because i_size is used to
244 : * determine whether ext4_inode_info->i_data contains symlink data or
245 : * block mappings. Setting i_size to 0 will remove its fast symlink
246 : * status. Erase i_data so that it becomes a valid empty block map.
247 : */
248 1998895 : if (ext4_inode_is_fast_symlink(inode))
249 25295 : memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
250 1996114 : inode->i_size = 0;
251 1996114 : err = ext4_mark_inode_dirty(handle, inode);
252 2002272 : if (err) {
253 0 : ext4_warning(inode->i_sb,
254 : "couldn't mark inode dirty (err %d)", err);
255 0 : goto stop_handle;
256 : }
257 2002272 : if (inode->i_blocks) {
258 292045 : err = ext4_truncate(inode);
259 292043 : if (err) {
260 0 : ext4_error_err(inode->i_sb, -err,
261 : "couldn't truncate inode %lu (err %d)",
262 : inode->i_ino, err);
263 0 : goto stop_handle;
264 : }
265 : }
266 :
267 : /* Remove xattr references. */
268 2002270 : err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
269 : extra_credits);
270 1994132 : if (err) {
271 0 : ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
272 0 : stop_handle:
273 0 : ext4_journal_stop(handle);
274 0 : ext4_orphan_del(NULL, inode);
275 0 : if (freeze_protected)
276 0 : sb_end_intwrite(inode->i_sb);
277 0 : ext4_xattr_inode_array_free(ea_inode_array);
278 0 : goto no_delete;
279 : }
280 :
281 : /*
282 : * Kill off the orphan record which ext4_truncate created.
283 : * AKPM: I think this can be inside the above `if'.
284 : * Note that ext4_orphan_del() has to be able to cope with the
285 : * deletion of a non-existent orphan - this is because we don't
286 : * know if ext4_truncate() actually created an orphan record.
287 : * (Well, we could do this if we need to, but heck - it works)
288 : */
289 1994132 : ext4_orphan_del(handle, inode);
290 2004562 : EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();
291 :
292 : /*
293 : * One subtle ordering requirement: if anything has gone wrong
294 : * (transaction abort, IO errors, whatever), then we can still
295 : * do these next steps (the fs will already have been marked as
296 : * having errors), but we can't free the inode if the mark_dirty
297 : * fails.
298 : */
299 2004566 : if (ext4_mark_inode_dirty(handle, inode))
300 : /* If that failed, just do the required in-core inode clear. */
301 0 : ext4_clear_inode(inode);
302 : else
303 2004510 : ext4_free_inode(handle, inode);
304 2003902 : ext4_journal_stop(handle);
305 2002894 : if (freeze_protected)
306 2001005 : sb_end_intwrite(inode->i_sb);
307 2003735 : ext4_xattr_inode_array_free(ea_inode_array);
308 2003147 : return;
309 1244334 : no_delete:
310 : /*
311 : * Check out some where else accidentally dirty the evicting inode,
312 : * which may probably cause inode use-after-free issues later.
313 : */
314 2488668 : WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
315 :
316 1244334 : if (!list_empty(&EXT4_I(inode)->i_fc_list))
317 0 : ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
318 1244334 : ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
319 : }
320 :
321 : #ifdef CONFIG_QUOTA
322 28998900 : qsize_t *ext4_get_reserved_space(struct inode *inode)
323 : {
324 28998900 : return &EXT4_I(inode)->i_reserved_quota;
325 : }
326 : #endif
327 :
328 : /*
329 : * Called with i_data_sem down, which is important since we can call
330 : * ext4_discard_preallocations() from here.
331 : */
332 901164 : void ext4_da_update_reserve_space(struct inode *inode,
333 : int used, int quota_claim)
334 : {
335 901164 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
336 901164 : struct ext4_inode_info *ei = EXT4_I(inode);
337 :
338 901164 : spin_lock(&ei->i_block_reservation_lock);
339 901171 : trace_ext4_da_update_reserve_space(inode, used, quota_claim);
340 901150 : if (unlikely(used > ei->i_reserved_data_blocks)) {
341 0 : ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
342 : "with only %d reserved data blocks",
343 : __func__, inode->i_ino, used,
344 : ei->i_reserved_data_blocks);
345 0 : WARN_ON(1);
346 0 : used = ei->i_reserved_data_blocks;
347 : }
348 :
349 : /* Update per-inode reservations */
350 901150 : ei->i_reserved_data_blocks -= used;
351 901150 : percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
352 :
353 901172 : spin_unlock(&ei->i_block_reservation_lock);
354 :
355 : /* Update quota subsystem for data blocks */
356 901186 : if (quota_claim)
357 863202 : dquot_claim_block(inode, EXT4_C2B(sbi, used));
358 : else {
359 : /*
360 : * We did fallocate with an offset that is already delayed
361 : * allocated. So on delayed allocated writeback we should
362 : * not re-claim the quota for fallocated blocks.
363 : */
364 37984 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
365 : }
366 :
367 : /*
368 : * If we have done all the pending block allocations and if
369 : * there aren't any writers on the inode, we can discard the
370 : * inode's preallocations.
371 : */
372 901175 : if ((ei->i_reserved_data_blocks == 0) &&
373 : !inode_is_open_for_write(inode))
374 382985 : ext4_discard_preallocations(inode, 0);
375 901175 : }
376 :
377 32193925 : static int __check_block_validity(struct inode *inode, const char *func,
378 : unsigned int line,
379 : struct ext4_map_blocks *map)
380 : {
381 32193925 : if (ext4_has_feature_journal(inode->i_sb) &&
382 29344195 : (inode->i_ino ==
383 29344195 : le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
384 : return 0;
385 27901836 : if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
386 0 : ext4_error_inode(inode, func, line, map->m_pblk,
387 : "lblock %lu mapped to illegal pblock %llu "
388 : "(length %d)", (unsigned long) map->m_lblk,
389 : map->m_pblk, map->m_len);
390 0 : return -EFSCORRUPTED;
391 : }
392 : return 0;
393 : }
394 :
395 1 : int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
396 : ext4_lblk_t len)
397 : {
398 1 : int ret;
399 :
400 1 : if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
401 : return fscrypt_zeroout_range(inode, lblk, pblk, len);
402 :
403 1 : ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
404 1 : if (ret > 0)
405 : ret = 0;
406 :
407 : return ret;
408 : }
409 :
410 : #define check_block_validity(inode, map) \
411 : __check_block_validity((inode), __func__, __LINE__, (map))
412 :
413 : #ifdef ES_AGGRESSIVE_TEST
414 : static void ext4_map_blocks_es_recheck(handle_t *handle,
415 : struct inode *inode,
416 : struct ext4_map_blocks *es_map,
417 : struct ext4_map_blocks *map,
418 : int flags)
419 : {
420 : int retval;
421 :
422 : map->m_flags = 0;
423 : /*
424 : * There is a race window that the result is not the same.
425 : * e.g. xfstests #223 when dioread_nolock enables. The reason
426 : * is that we lookup a block mapping in extent status tree with
427 : * out taking i_data_sem. So at the time the unwritten extent
428 : * could be converted.
429 : */
430 : down_read(&EXT4_I(inode)->i_data_sem);
431 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
432 : retval = ext4_ext_map_blocks(handle, inode, map, 0);
433 : } else {
434 : retval = ext4_ind_map_blocks(handle, inode, map, 0);
435 : }
436 : up_read((&EXT4_I(inode)->i_data_sem));
437 :
438 : /*
439 : * We don't check m_len because extent will be collpased in status
440 : * tree. So the m_len might not equal.
441 : */
442 : if (es_map->m_lblk != map->m_lblk ||
443 : es_map->m_flags != map->m_flags ||
444 : es_map->m_pblk != map->m_pblk) {
445 : printk("ES cache assertion failed for inode: %lu "
446 : "es_cached ex [%d/%d/%llu/%x] != "
447 : "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
448 : inode->i_ino, es_map->m_lblk, es_map->m_len,
449 : es_map->m_pblk, es_map->m_flags, map->m_lblk,
450 : map->m_len, map->m_pblk, map->m_flags,
451 : retval, flags);
452 : }
453 : }
454 : #endif /* ES_AGGRESSIVE_TEST */
455 :
456 : /*
457 : * The ext4_map_blocks() function tries to look up the requested blocks,
458 : * and returns if the blocks are already mapped.
459 : *
460 : * Otherwise it takes the write lock of the i_data_sem and allocate blocks
461 : * and store the allocated blocks in the result buffer head and mark it
462 : * mapped.
463 : *
464 : * If file type is extents based, it will call ext4_ext_map_blocks(),
465 : * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
466 : * based files
467 : *
468 : * On success, it returns the number of blocks being mapped or allocated. if
469 : * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
470 : * is marked as unwritten. If the create == 1, it will mark @map as mapped.
471 : *
472 : * It returns 0 if plain look up failed (blocks have not been allocated), in
473 : * that case, @map is returned as unmapped but we still do fill map->m_len to
474 : * indicate the length of a hole starting at map->m_lblk.
475 : *
476 : * It returns the error in case of allocation failure.
477 : */
478 74230637 : int ext4_map_blocks(handle_t *handle, struct inode *inode,
479 : struct ext4_map_blocks *map, int flags)
480 : {
481 74230637 : struct extent_status es;
482 74230637 : int retval;
483 74230637 : int ret = 0;
484 : #ifdef ES_AGGRESSIVE_TEST
485 : struct ext4_map_blocks orig_map;
486 :
487 : memcpy(&orig_map, map, sizeof(*map));
488 : #endif
489 :
490 74230637 : map->m_flags = 0;
491 74230637 : ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
492 : flags, map->m_len, (unsigned long) map->m_lblk);
493 :
494 : /*
495 : * ext4_map_blocks returns an int, and m_len is an unsigned int
496 : */
497 74230637 : if (unlikely(map->m_len > INT_MAX))
498 82190 : map->m_len = INT_MAX;
499 :
500 : /* We can handle the block number less than EXT_MAX_BLOCKS */
501 74230637 : if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
502 : return -EFSCORRUPTED;
503 :
504 : /* Lookup extent status tree firstly */
505 148482447 : if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
506 74225808 : ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
507 71217492 : if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
508 37489958 : map->m_pblk = ext4_es_pblock(&es) +
509 37489958 : map->m_lblk - es.es_lblk;
510 74979916 : map->m_flags |= ext4_es_is_written(&es) ?
511 37489958 : EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
512 37489958 : retval = es.es_len - (map->m_lblk - es.es_lblk);
513 37489958 : if (retval > map->m_len)
514 : retval = map->m_len;
515 37489958 : map->m_len = retval;
516 33727534 : } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
517 33727534 : map->m_pblk = 0;
518 33727534 : retval = es.es_len - (map->m_lblk - es.es_lblk);
519 33727534 : if (retval > map->m_len)
520 : retval = map->m_len;
521 33727534 : map->m_len = retval;
522 33727534 : retval = 0;
523 : } else {
524 0 : BUG();
525 : }
526 :
527 71217492 : if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
528 : return retval;
529 : #ifdef ES_AGGRESSIVE_TEST
530 : ext4_map_blocks_es_recheck(handle, inode, map,
531 : &orig_map, flags);
532 : #endif
533 71209819 : goto found;
534 : }
535 : /*
536 : * In the query cache no-wait mode, nothing we can do more if we
537 : * cannot find extent in the cache.
538 : */
539 3039147 : if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
540 : return 0;
541 :
542 : /*
543 : * Try to see if we can get the block without requesting a new
544 : * file system block.
545 : */
546 3039147 : down_read(&EXT4_I(inode)->i_data_sem);
547 3038915 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
548 3024307 : retval = ext4_ext_map_blocks(handle, inode, map, 0);
549 : } else {
550 14608 : retval = ext4_ind_map_blocks(handle, inode, map, 0);
551 : }
552 3038783 : if (retval > 0) {
553 1056206 : unsigned int status;
554 :
555 1056206 : if (unlikely(retval != map->m_len)) {
556 0 : ext4_warning(inode->i_sb,
557 : "ES len assertion failed for inode "
558 : "%lu: retval %d != map->m_len %d",
559 : inode->i_ino, retval, map->m_len);
560 0 : WARN_ON(1);
561 : }
562 :
563 1056206 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
564 1056206 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
565 1056206 : if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
566 1385096 : !(status & EXTENT_STATUS_WRITTEN) &&
567 328890 : ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
568 328890 : map->m_lblk + map->m_len - 1))
569 1 : status |= EXTENT_STATUS_DELAYED;
570 1056206 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
571 : map->m_pblk, status);
572 : }
573 3038784 : up_read((&EXT4_I(inode)->i_data_sem));
574 :
575 74246169 : found:
576 74246169 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
577 25876519 : ret = check_block_validity(inode, map);
578 25878192 : if (ret != 0)
579 : return ret;
580 : }
581 :
582 : /* If it is only a block(s) look up */
583 74247842 : if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
584 : return retval;
585 :
586 : /*
587 : * Returns if the blocks have already allocated
588 : *
589 : * Note that if blocks have been preallocated
590 : * ext4_ext_get_block() returns the create = 0
591 : * with buffer head unmapped.
592 : */
593 7679368 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
594 : /*
595 : * If we need to convert extent to unwritten
596 : * we continue and do the actual work in
597 : * ext4_ext_map_blocks()
598 : */
599 929217 : if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
600 : return retval;
601 :
602 : /*
603 : * Here we clear m_flags because after allocating an new extent,
604 : * it will be set again.
605 : */
606 6872702 : map->m_flags &= ~EXT4_MAP_FLAGS;
607 :
608 : /*
609 : * New blocks allocate and/or writing to unwritten extent
610 : * will possibly result in updating i_data, so we take
611 : * the write lock of i_data_sem, and call get_block()
612 : * with create == 1 flag.
613 : */
614 6872702 : down_write(&EXT4_I(inode)->i_data_sem);
615 :
616 : /*
617 : * We need to check for EXT4 here because migrate
618 : * could have changed the inode type in between
619 : */
620 6871838 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
621 6867418 : retval = ext4_ext_map_blocks(handle, inode, map, flags);
622 : } else {
623 4420 : retval = ext4_ind_map_blocks(handle, inode, map, flags);
624 :
625 4423 : if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
626 : /*
627 : * We allocated new blocks which will result in
628 : * i_data's format changing. Force the migrate
629 : * to fail by clearing migrate flags
630 : */
631 4421 : ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
632 : }
633 : }
634 :
635 6872984 : if (retval > 0) {
636 6443336 : unsigned int status;
637 :
638 6443336 : if (unlikely(retval != map->m_len)) {
639 0 : ext4_warning(inode->i_sb,
640 : "ES len assertion failed for inode "
641 : "%lu: retval %d != map->m_len %d",
642 : inode->i_ino, retval, map->m_len);
643 0 : WARN_ON(1);
644 : }
645 :
646 : /*
647 : * We have to zeroout blocks before inserting them into extent
648 : * status tree. Otherwise someone could look them up there and
649 : * use them before they are really zeroed. We also have to
650 : * unmap metadata before zeroing as otherwise writeback can
651 : * overwrite zeros with stale data from block device.
652 : */
653 6443336 : if (flags & EXT4_GET_BLOCKS_ZERO &&
654 0 : map->m_flags & EXT4_MAP_MAPPED &&
655 : map->m_flags & EXT4_MAP_NEW) {
656 0 : ret = ext4_issue_zeroout(inode, map->m_lblk,
657 : map->m_pblk, map->m_len);
658 0 : if (ret) {
659 0 : retval = ret;
660 0 : goto out_sem;
661 : }
662 : }
663 :
664 : /*
665 : * If the extent has been zeroed out, we don't need to update
666 : * extent status tree.
667 : */
668 8248164 : if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
669 1804636 : ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
670 1804752 : if (ext4_es_is_written(&es))
671 2 : goto out_sem;
672 : }
673 6443526 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
674 6443526 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
675 6443526 : if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
676 8640913 : !(status & EXTENT_STATUS_WRITTEN) &&
677 3083060 : ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
678 3083060 : map->m_lblk + map->m_len - 1))
679 76059 : status |= EXTENT_STATUS_DELAYED;
680 6443436 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
681 : map->m_pblk, status);
682 : }
683 :
684 429648 : out_sem:
685 6872486 : up_write((&EXT4_I(inode)->i_data_sem));
686 6872070 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
687 6320017 : ret = check_block_validity(inode, map);
688 6320376 : if (ret != 0)
689 : return ret;
690 :
691 : /*
692 : * Inodes with freshly allocated blocks where contents will be
693 : * visible after transaction commit must be on transaction's
694 : * ordered data list.
695 : */
696 6320376 : if (map->m_flags & EXT4_MAP_NEW &&
697 673083 : !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
698 1346164 : !(flags & EXT4_GET_BLOCKS_ZERO) &&
699 670261 : !ext4_is_quota_file(inode) &&
700 : ext4_should_order_data(inode)) {
701 524460 : loff_t start_byte =
702 262230 : (loff_t)map->m_lblk << inode->i_blkbits;
703 262230 : loff_t length = (loff_t)map->m_len << inode->i_blkbits;
704 :
705 262230 : if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
706 206 : ret = ext4_jbd2_inode_add_wait(handle, inode,
707 : start_byte, length);
708 : else
709 262024 : ret = ext4_jbd2_inode_add_write(handle, inode,
710 : start_byte, length);
711 262481 : if (ret)
712 : return ret;
713 : }
714 : }
715 6872614 : if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
716 : map->m_flags & EXT4_MAP_MAPPED))
717 6442937 : ext4_fc_track_range(handle, inode, map->m_lblk,
718 6442937 : map->m_lblk + map->m_len - 1);
719 : if (retval < 0)
720 : ext_debug(inode, "failed with err %d\n", retval);
721 : return retval;
722 : }
723 :
724 : /*
725 : * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
726 : * we have to be careful as someone else may be manipulating b_state as well.
727 : */
728 10967150 : static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
729 : {
730 10967150 : unsigned long old_state;
731 10967150 : unsigned long new_state;
732 :
733 10967150 : flags &= EXT4_MAP_FLAGS;
734 :
735 : /* Dummy buffer_head? Set non-atomically. */
736 10967150 : if (!bh->b_page) {
737 0 : bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
738 0 : return;
739 : }
740 : /*
741 : * Someone else may be modifying b_state. Be careful! This is ugly but
742 : * once we get rid of using bh as a container for mapping information
743 : * to pass to / from get_block functions, this can go away.
744 : */
745 10967150 : old_state = READ_ONCE(bh->b_state);
746 10967150 : do {
747 10967150 : new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
748 10967150 : } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
749 : }
750 :
751 1221778 : static int _ext4_get_block(struct inode *inode, sector_t iblock,
752 : struct buffer_head *bh, int flags)
753 : {
754 1221778 : struct ext4_map_blocks map;
755 1221778 : int ret = 0;
756 :
757 1221778 : if (ext4_has_inline_data(inode))
758 : return -ERANGE;
759 :
760 1221778 : map.m_lblk = iblock;
761 1221778 : map.m_len = bh->b_size >> inode->i_blkbits;
762 :
763 1221778 : ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
764 : flags);
765 1222683 : if (ret > 0) {
766 923034 : map_bh(bh, inode->i_sb, map.m_pblk);
767 923254 : ext4_update_bh_state(bh, map.m_flags);
768 923313 : bh->b_size = inode->i_sb->s_blocksize * map.m_len;
769 923313 : ret = 0;
770 299649 : } else if (ret == 0) {
771 : /* hole case, need to fill in bh->b_size */
772 294703 : bh->b_size = inode->i_sb->s_blocksize * map.m_len;
773 : }
774 : return ret;
775 : }
776 :
777 192704 : int ext4_get_block(struct inode *inode, sector_t iblock,
778 : struct buffer_head *bh, int create)
779 : {
780 192704 : return _ext4_get_block(inode, iblock, bh,
781 : create ? EXT4_GET_BLOCKS_CREATE : 0);
782 : }
783 :
784 : /*
785 : * Get block function used when preparing for buffered write if we require
786 : * creating an unwritten extent if blocks haven't been allocated. The extent
787 : * will be converted to written after the IO is complete.
788 : */
789 249306 : int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
790 : struct buffer_head *bh_result, int create)
791 : {
792 249306 : ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
793 : inode->i_ino, create);
794 249306 : return _ext4_get_block(inode, iblock, bh_result,
795 : EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
796 : }
797 :
798 : /* Maximum number of blocks we map for direct IO at once. */
799 : #define DIO_MAX_BLOCKS 4096
800 :
801 : /*
802 : * `handle' can be NULL if create is zero
803 : */
804 18720714 : struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
805 : ext4_lblk_t block, int map_flags)
806 : {
807 18720714 : struct ext4_map_blocks map;
808 18720714 : struct buffer_head *bh;
809 18720714 : int create = map_flags & EXT4_GET_BLOCKS_CREATE;
810 18720714 : bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
811 18720714 : int err;
812 :
813 18720714 : ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
814 : || handle != NULL || create == 0);
815 18720714 : ASSERT(create == 0 || !nowait);
816 :
817 18720714 : map.m_lblk = block;
818 18720714 : map.m_len = 1;
819 18720714 : err = ext4_map_blocks(handle, inode, &map, map_flags);
820 :
821 18750645 : if (err == 0)
822 0 : return create ? ERR_PTR(-ENOSPC) : NULL;
823 18750645 : if (err < 0)
824 44492 : return ERR_PTR(err);
825 :
826 18706153 : if (nowait)
827 1367 : return sb_find_get_block(inode->i_sb, map.m_pblk);
828 :
829 18704786 : bh = sb_getblk(inode->i_sb, map.m_pblk);
830 18710990 : if (unlikely(!bh))
831 : return ERR_PTR(-ENOMEM);
832 18710990 : if (map.m_flags & EXT4_MAP_NEW) {
833 410883 : ASSERT(create != 0);
834 410883 : ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
835 : || (handle != NULL));
836 :
837 : /*
838 : * Now that we do not always journal data, we should
839 : * keep in mind whether this should always journal the
840 : * new buffer as metadata. For now, regular file
841 : * writes use ext4_get_block instead, so it's not a
842 : * problem.
843 : */
844 410883 : lock_buffer(bh);
845 410963 : BUFFER_TRACE(bh, "call get_create_access");
846 410963 : err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
847 : EXT4_JTR_NONE);
848 410887 : if (unlikely(err)) {
849 0 : unlock_buffer(bh);
850 0 : goto errout;
851 : }
852 821863 : if (!buffer_uptodate(bh)) {
853 377995 : memset(bh->b_data, 0, inode->i_sb->s_blocksize);
854 377995 : set_buffer_uptodate(bh);
855 : }
856 411007 : unlock_buffer(bh);
857 410961 : BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
858 410961 : err = ext4_handle_dirty_metadata(handle, inode, bh);
859 411030 : if (unlikely(err))
860 0 : goto errout;
861 : } else
862 : BUFFER_TRACE(bh, "not a new buffer");
863 : return bh;
864 0 : errout:
865 0 : brelse(bh);
866 0 : return ERR_PTR(err);
867 : }
868 :
869 16383935 : struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
870 : ext4_lblk_t block, int map_flags)
871 : {
872 16383935 : struct buffer_head *bh;
873 16383935 : int ret;
874 :
875 16383935 : bh = ext4_getblk(handle, inode, block, map_flags);
876 16405596 : if (IS_ERR(bh))
877 : return bh;
878 16361123 : if (!bh || ext4_buffer_uptodate(bh))
879 16327960 : return bh;
880 :
881 24108 : ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
882 24108 : if (ret) {
883 1 : put_bh(bh);
884 1 : return ERR_PTR(ret);
885 : }
886 : return bh;
887 : }
888 :
889 : /* Read a contiguous batch of blocks. */
890 2348704 : int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
891 : bool wait, struct buffer_head **bhs)
892 : {
893 2348704 : int i, err;
894 :
895 4699331 : for (i = 0; i < bh_count; i++) {
896 2349386 : bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
897 2350627 : if (IS_ERR(bhs[i])) {
898 0 : err = PTR_ERR(bhs[i]);
899 0 : bh_count = i;
900 0 : goto out_brelse;
901 : }
902 : }
903 :
904 4695193 : for (i = 0; i < bh_count; i++)
905 : /* Note that NULL bhs[i] is valid because of holes. */
906 2349745 : if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
907 1051 : ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
908 :
909 2345448 : if (!wait)
910 : return 0;
911 :
912 321 : for (i = 0; i < bh_count; i++)
913 279 : if (bhs[i])
914 279 : wait_on_buffer(bhs[i]);
915 :
916 321 : for (i = 0; i < bh_count; i++) {
917 558 : if (bhs[i] && !buffer_uptodate(bhs[i])) {
918 0 : err = -EIO;
919 0 : goto out_brelse;
920 : }
921 : }
922 : return 0;
923 :
924 0 : out_brelse:
925 0 : for (i = 0; i < bh_count; i++) {
926 0 : brelse(bhs[i]);
927 0 : bhs[i] = NULL;
928 : }
929 : return err;
930 : }
931 :
932 97 : int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
933 : struct buffer_head *head,
934 : unsigned from,
935 : unsigned to,
936 : int *partial,
937 : int (*fn)(handle_t *handle, struct inode *inode,
938 : struct buffer_head *bh))
939 : {
940 97 : struct buffer_head *bh;
941 97 : unsigned block_start, block_end;
942 97 : unsigned blocksize = head->b_size;
943 97 : int err, ret = 0;
944 97 : struct buffer_head *next;
945 :
946 97 : for (bh = head, block_start = 0;
947 194 : ret == 0 && (bh != head || !block_start);
948 : block_start = block_end, bh = next) {
949 97 : next = bh->b_this_page;
950 97 : block_end = block_start + blocksize;
951 97 : if (block_end <= from || block_start >= to) {
952 0 : if (partial && !buffer_uptodate(bh))
953 0 : *partial = 1;
954 0 : continue;
955 : }
956 97 : err = (*fn)(handle, inode, bh);
957 97 : if (!ret)
958 97 : ret = err;
959 : }
960 97 : return ret;
961 : }
962 :
963 : /*
964 : * Helper for handling dirtying of journalled data. We also mark the folio as
965 : * dirty so that writeback code knows about this page (and inode) contains
966 : * dirty data. ext4_writepages() then commits appropriate transaction to
967 : * make data stable.
968 : */
969 0 : static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
970 : {
971 0 : folio_mark_dirty(bh->b_folio);
972 0 : return ext4_handle_dirty_metadata(handle, NULL, bh);
973 : }
974 :
975 0 : int do_journal_get_write_access(handle_t *handle, struct inode *inode,
976 : struct buffer_head *bh)
977 : {
978 0 : int dirty = buffer_dirty(bh);
979 0 : int ret;
980 :
981 0 : if (!buffer_mapped(bh) || buffer_freed(bh))
982 : return 0;
983 : /*
984 : * __block_write_begin() could have dirtied some buffers. Clean
985 : * the dirty bit as jbd2_journal_get_write_access() could complain
986 : * otherwise about fs integrity issues. Setting of the dirty bit
987 : * by __block_write_begin() isn't a real problem here as we clear
988 : * the bit before releasing a page lock and thus writeback cannot
989 : * ever write the buffer.
990 : */
991 0 : if (dirty)
992 0 : clear_buffer_dirty(bh);
993 0 : BUFFER_TRACE(bh, "get write access");
994 0 : ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
995 : EXT4_JTR_NONE);
996 0 : if (!ret && dirty)
997 0 : ret = ext4_dirty_journalled_data(handle, bh);
998 : return ret;
999 : }
1000 :
1001 : #ifdef CONFIG_FS_ENCRYPTION
1002 : static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
1003 : get_block_t *get_block)
1004 : {
1005 : unsigned from = pos & (PAGE_SIZE - 1);
1006 : unsigned to = from + len;
1007 : struct inode *inode = folio->mapping->host;
1008 : unsigned block_start, block_end;
1009 : sector_t block;
1010 : int err = 0;
1011 : unsigned blocksize = inode->i_sb->s_blocksize;
1012 : unsigned bbits;
1013 : struct buffer_head *bh, *head, *wait[2];
1014 : int nr_wait = 0;
1015 : int i;
1016 :
1017 : BUG_ON(!folio_test_locked(folio));
1018 : BUG_ON(from > PAGE_SIZE);
1019 : BUG_ON(to > PAGE_SIZE);
1020 : BUG_ON(from > to);
1021 :
1022 : head = folio_buffers(folio);
1023 : if (!head) {
1024 : create_empty_buffers(&folio->page, blocksize, 0);
1025 : head = folio_buffers(folio);
1026 : }
1027 : bbits = ilog2(blocksize);
1028 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1029 :
1030 : for (bh = head, block_start = 0; bh != head || !block_start;
1031 : block++, block_start = block_end, bh = bh->b_this_page) {
1032 : block_end = block_start + blocksize;
1033 : if (block_end <= from || block_start >= to) {
1034 : if (folio_test_uptodate(folio)) {
1035 : set_buffer_uptodate(bh);
1036 : }
1037 : continue;
1038 : }
1039 : if (buffer_new(bh))
1040 : clear_buffer_new(bh);
1041 : if (!buffer_mapped(bh)) {
1042 : WARN_ON(bh->b_size != blocksize);
1043 : err = get_block(inode, block, bh, 1);
1044 : if (err)
1045 : break;
1046 : if (buffer_new(bh)) {
1047 : if (folio_test_uptodate(folio)) {
1048 : clear_buffer_new(bh);
1049 : set_buffer_uptodate(bh);
1050 : mark_buffer_dirty(bh);
1051 : continue;
1052 : }
1053 : if (block_end > to || block_start < from)
1054 : folio_zero_segments(folio, to,
1055 : block_end,
1056 : block_start, from);
1057 : continue;
1058 : }
1059 : }
1060 : if (folio_test_uptodate(folio)) {
1061 : set_buffer_uptodate(bh);
1062 : continue;
1063 : }
1064 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1065 : !buffer_unwritten(bh) &&
1066 : (block_start < from || block_end > to)) {
1067 : ext4_read_bh_lock(bh, 0, false);
1068 : wait[nr_wait++] = bh;
1069 : }
1070 : }
1071 : /*
1072 : * If we issued read requests, let them complete.
1073 : */
1074 : for (i = 0; i < nr_wait; i++) {
1075 : wait_on_buffer(wait[i]);
1076 : if (!buffer_uptodate(wait[i]))
1077 : err = -EIO;
1078 : }
1079 : if (unlikely(err)) {
1080 : folio_zero_new_buffers(folio, from, to);
1081 : } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
1082 : for (i = 0; i < nr_wait; i++) {
1083 : int err2;
1084 :
1085 : err2 = fscrypt_decrypt_pagecache_blocks(folio,
1086 : blocksize, bh_offset(wait[i]));
1087 : if (err2) {
1088 : clear_buffer_uptodate(wait[i]);
1089 : err = err2;
1090 : }
1091 : }
1092 : }
1093 :
1094 : return err;
1095 : }
1096 : #endif
1097 :
1098 : /*
1099 : * To preserve ordering, it is essential that the hole instantiation and
1100 : * the data write be encapsulated in a single transaction. We cannot
1101 : * close off a transaction and start a new one between the ext4_get_block()
1102 : * and the ext4_write_end(). So doing the jbd2_journal_start at the start of
1103 : * ext4_write_begin() is the right place.
1104 : */
1105 286020 : static int ext4_write_begin(struct file *file, struct address_space *mapping,
1106 : loff_t pos, unsigned len,
1107 : struct page **pagep, void **fsdata)
1108 : {
1109 286020 : struct inode *inode = mapping->host;
1110 286020 : int ret, needed_blocks;
1111 286020 : handle_t *handle;
1112 286020 : int retries = 0;
1113 286020 : struct folio *folio;
1114 286020 : pgoff_t index;
1115 286020 : unsigned from, to;
1116 :
1117 572040 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
1118 : return -EIO;
1119 :
1120 286020 : trace_ext4_write_begin(inode, pos, len);
1121 : /*
1122 : * Reserve one block more for addition to orphan list in case
1123 : * we allocate blocks but write fails for some reason
1124 : */
1125 286115 : needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1126 285847 : index = pos >> PAGE_SHIFT;
1127 285847 : from = pos & (PAGE_SIZE - 1);
1128 285847 : to = from + len;
1129 :
1130 285847 : if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
1131 0 : ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
1132 : pagep);
1133 0 : if (ret < 0)
1134 : return ret;
1135 0 : if (ret == 1)
1136 : return 0;
1137 : }
1138 :
1139 : /*
1140 : * __filemap_get_folio() can take a long time if the
1141 : * system is thrashing due to memory pressure, or if the folio
1142 : * is being written back. So grab it first before we start
1143 : * the transaction handle. This also allows us to allocate
1144 : * the folio (if needed) without using GFP_NOFS.
1145 : */
1146 285847 : retry_grab:
1147 287661 : folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
1148 : mapping_gfp_mask(mapping));
1149 287042 : if (IS_ERR(folio))
1150 0 : return PTR_ERR(folio);
1151 : /*
1152 : * The same as page allocation, we prealloc buffer heads before
1153 : * starting the handle.
1154 : */
1155 287042 : if (!folio_buffers(folio))
1156 250978 : create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
1157 :
1158 287892 : folio_unlock(folio);
1159 :
1160 290654 : retry_journal:
1161 290654 : handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
1162 290086 : if (IS_ERR(handle)) {
1163 0 : folio_put(folio);
1164 0 : return PTR_ERR(handle);
1165 : }
1166 :
1167 290086 : folio_lock(folio);
1168 290234 : if (folio->mapping != mapping) {
1169 : /* The folio got truncated from under us */
1170 1815 : folio_unlock(folio);
1171 1814 : folio_put(folio);
1172 1815 : ext4_journal_stop(handle);
1173 1814 : goto retry_grab;
1174 : }
1175 : /* In case writeback began while the folio was unlocked */
1176 288419 : folio_wait_stable(folio);
1177 :
1178 : #ifdef CONFIG_FS_ENCRYPTION
1179 : if (ext4_should_dioread_nolock(inode))
1180 : ret = ext4_block_write_begin(folio, pos, len,
1181 : ext4_get_block_unwritten);
1182 : else
1183 : ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
1184 : #else
1185 288368 : if (ext4_should_dioread_nolock(inode))
1186 284483 : ret = __block_write_begin(&folio->page, pos, len,
1187 : ext4_get_block_unwritten);
1188 : else
1189 3816 : ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
1190 : #endif
1191 571490 : if (!ret && ext4_should_journal_data(inode)) {
1192 0 : ret = ext4_walk_page_buffers(handle, inode,
1193 : folio_buffers(folio), from, to,
1194 : NULL, do_journal_get_write_access);
1195 : }
1196 :
1197 288106 : if (ret) {
1198 4803 : bool extended = (pos + len > inode->i_size) &&
1199 : !ext4_verity_in_progress(inode);
1200 :
1201 4803 : folio_unlock(folio);
1202 : /*
1203 : * __block_write_begin may have instantiated a few blocks
1204 : * outside i_size. Trim these off again. Don't need
1205 : * i_size_read because we hold i_rwsem.
1206 : *
1207 : * Add inode to orphan list in case we crash before
1208 : * truncate finishes
1209 : */
1210 4803 : if (extended && ext4_can_truncate(inode))
1211 2573 : ext4_orphan_add(handle, inode);
1212 :
1213 4803 : ext4_journal_stop(handle);
1214 4803 : if (extended) {
1215 2573 : ext4_truncate_failed_write(inode);
1216 : /*
1217 : * If truncate failed early the inode might
1218 : * still be on the orphan list; we need to
1219 : * make sure the inode is removed from the
1220 : * orphan list in that case.
1221 : */
1222 2573 : if (inode->i_nlink)
1223 2573 : ext4_orphan_del(NULL, inode);
1224 : }
1225 :
1226 9604 : if (ret == -ENOSPC &&
1227 4803 : ext4_should_retry_alloc(inode->i_sb, &retries))
1228 2680 : goto retry_journal;
1229 2121 : folio_put(folio);
1230 2121 : return ret;
1231 : }
1232 283303 : *pagep = &folio->page;
1233 283303 : return ret;
1234 : }
1235 :
1236 : /* For write_end() in data=journal mode */
1237 0 : static int write_end_fn(handle_t *handle, struct inode *inode,
1238 : struct buffer_head *bh)
1239 : {
1240 0 : int ret;
1241 0 : if (!buffer_mapped(bh) || buffer_freed(bh))
1242 : return 0;
1243 0 : set_buffer_uptodate(bh);
1244 0 : ret = ext4_dirty_journalled_data(handle, bh);
1245 0 : clear_buffer_meta(bh);
1246 0 : clear_buffer_prio(bh);
1247 0 : return ret;
1248 : }
1249 :
1250 : /*
1251 : * We need to pick up the new inode size which generic_commit_write gave us
1252 : * `file' can be NULL - eg, when called from page_symlink().
1253 : *
1254 : * ext4 never places buffers on inode->i_mapping->private_list. metadata
1255 : * buffers are managed internally.
1256 : */
1257 283602 : static int ext4_write_end(struct file *file,
1258 : struct address_space *mapping,
1259 : loff_t pos, unsigned len, unsigned copied,
1260 : struct page *page, void *fsdata)
1261 : {
1262 283602 : struct folio *folio = page_folio(page);
1263 283865 : handle_t *handle = ext4_journal_current_handle();
1264 283865 : struct inode *inode = mapping->host;
1265 283865 : loff_t old_size = inode->i_size;
1266 283865 : int ret = 0, ret2;
1267 283865 : int i_size_changed = 0;
1268 283865 : bool verity = ext4_verity_in_progress(inode);
1269 :
1270 283865 : trace_ext4_write_end(inode, pos, len, copied);
1271 :
1272 283771 : if (ext4_has_inline_data(inode) &&
1273 : ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
1274 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
1275 : folio);
1276 :
1277 283771 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1278 : /*
1279 : * it's important to update i_size while still holding folio lock:
1280 : * page writeout could otherwise come in and zero beyond i_size.
1281 : *
1282 : * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
1283 : * blocks are being written past EOF, so skip the i_size update.
1284 : */
1285 283979 : if (!verity)
1286 283979 : i_size_changed = ext4_update_inode_size(inode, pos + copied);
1287 283950 : folio_unlock(folio);
1288 284004 : folio_put(folio);
1289 :
1290 284043 : if (old_size < pos && !verity)
1291 610 : pagecache_isize_extended(inode, old_size, pos);
1292 : /*
1293 : * Don't mark the inode dirty under folio lock. First, it unnecessarily
1294 : * makes the holding time of folio lock longer. Second, it forces lock
1295 : * ordering of folio lock and transaction start for journaling
1296 : * filesystems.
1297 : */
1298 284042 : if (i_size_changed)
1299 186430 : ret = ext4_mark_inode_dirty(handle, inode);
1300 :
1301 284045 : if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
1302 : /* if we have allocated more blocks and copied
1303 : * less. We will have blocks allocated outside
1304 : * inode->i_size. So truncate them
1305 : */
1306 0 : ext4_orphan_add(handle, inode);
1307 :
1308 284045 : ret2 = ext4_journal_stop(handle);
1309 283597 : if (!ret)
1310 283595 : ret = ret2;
1311 :
1312 283597 : if (pos + len > inode->i_size && !verity) {
1313 0 : ext4_truncate_failed_write(inode);
1314 : /*
1315 : * If truncate failed early the inode might still be
1316 : * on the orphan list; we need to make sure the inode
1317 : * is removed from the orphan list in that case.
1318 : */
1319 0 : if (inode->i_nlink)
1320 0 : ext4_orphan_del(NULL, inode);
1321 : }
1322 :
1323 283597 : return ret ? ret : copied;
1324 : }
1325 :
1326 : /*
1327 : * This is a private version of folio_zero_new_buffers() which doesn't
1328 : * set the buffer to be dirty, since in data=journalled mode we need
1329 : * to call ext4_dirty_journalled_data() instead.
1330 : */
1331 0 : static void ext4_journalled_zero_new_buffers(handle_t *handle,
1332 : struct inode *inode,
1333 : struct folio *folio,
1334 : unsigned from, unsigned to)
1335 : {
1336 0 : unsigned int block_start = 0, block_end;
1337 0 : struct buffer_head *head, *bh;
1338 :
1339 0 : bh = head = folio_buffers(folio);
1340 0 : do {
1341 0 : block_end = block_start + bh->b_size;
1342 0 : if (buffer_new(bh)) {
1343 0 : if (block_end > from && block_start < to) {
1344 0 : if (!folio_test_uptodate(folio)) {
1345 0 : unsigned start, size;
1346 :
1347 0 : start = max(from, block_start);
1348 0 : size = min(to, block_end) - start;
1349 :
1350 0 : folio_zero_range(folio, start, size);
1351 0 : write_end_fn(handle, inode, bh);
1352 : }
1353 0 : clear_buffer_new(bh);
1354 : }
1355 : }
1356 0 : block_start = block_end;
1357 0 : bh = bh->b_this_page;
1358 0 : } while (bh != head);
1359 0 : }
1360 :
1361 0 : static int ext4_journalled_write_end(struct file *file,
1362 : struct address_space *mapping,
1363 : loff_t pos, unsigned len, unsigned copied,
1364 : struct page *page, void *fsdata)
1365 : {
1366 0 : struct folio *folio = page_folio(page);
1367 0 : handle_t *handle = ext4_journal_current_handle();
1368 0 : struct inode *inode = mapping->host;
1369 0 : loff_t old_size = inode->i_size;
1370 0 : int ret = 0, ret2;
1371 0 : int partial = 0;
1372 0 : unsigned from, to;
1373 0 : int size_changed = 0;
1374 0 : bool verity = ext4_verity_in_progress(inode);
1375 :
1376 0 : trace_ext4_journalled_write_end(inode, pos, len, copied);
1377 0 : from = pos & (PAGE_SIZE - 1);
1378 0 : to = from + len;
1379 :
1380 0 : BUG_ON(!ext4_handle_valid(handle));
1381 :
1382 0 : if (ext4_has_inline_data(inode))
1383 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
1384 : folio);
1385 :
1386 0 : if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
1387 0 : copied = 0;
1388 0 : ext4_journalled_zero_new_buffers(handle, inode, folio,
1389 : from, to);
1390 : } else {
1391 0 : if (unlikely(copied < len))
1392 0 : ext4_journalled_zero_new_buffers(handle, inode, folio,
1393 : from + copied, to);
1394 0 : ret = ext4_walk_page_buffers(handle, inode,
1395 : folio_buffers(folio),
1396 : from, from + copied, &partial,
1397 : write_end_fn);
1398 0 : if (!partial)
1399 0 : folio_mark_uptodate(folio);
1400 : }
1401 0 : if (!verity)
1402 0 : size_changed = ext4_update_inode_size(inode, pos + copied);
1403 0 : EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1404 0 : folio_unlock(folio);
1405 0 : folio_put(folio);
1406 :
1407 0 : if (old_size < pos && !verity)
1408 0 : pagecache_isize_extended(inode, old_size, pos);
1409 :
1410 0 : if (size_changed) {
1411 0 : ret2 = ext4_mark_inode_dirty(handle, inode);
1412 0 : if (!ret)
1413 0 : ret = ret2;
1414 : }
1415 :
1416 0 : if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
1417 : /* if we have allocated more blocks and copied
1418 : * less. We will have blocks allocated outside
1419 : * inode->i_size. So truncate them
1420 : */
1421 0 : ext4_orphan_add(handle, inode);
1422 :
1423 0 : ret2 = ext4_journal_stop(handle);
1424 0 : if (!ret)
1425 0 : ret = ret2;
1426 0 : if (pos + len > inode->i_size && !verity) {
1427 0 : ext4_truncate_failed_write(inode);
1428 : /*
1429 : * If truncate failed early the inode might still be
1430 : * on the orphan list; we need to make sure the inode
1431 : * is removed from the orphan list in that case.
1432 : */
1433 0 : if (inode->i_nlink)
1434 0 : ext4_orphan_del(NULL, inode);
1435 : }
1436 :
1437 0 : return ret ? ret : copied;
1438 : }
1439 :
1440 : /*
1441 : * Reserve space for a single cluster
1442 : */
1443 26762736 : static int ext4_da_reserve_space(struct inode *inode)
1444 : {
1445 26762736 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1446 26762736 : struct ext4_inode_info *ei = EXT4_I(inode);
1447 26762736 : int ret;
1448 :
1449 : /*
1450 : * We will charge metadata quota at writeout time; this saves
1451 : * us from metadata over-estimation, though we may go over by
1452 : * a small amount in the end. Here we just reserve for data.
1453 : */
1454 26762736 : ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1455 26778751 : if (ret)
1456 : return ret;
1457 :
1458 26775666 : spin_lock(&ei->i_block_reservation_lock);
1459 26799650 : if (ext4_claim_free_clusters(sbi, 1, 0)) {
1460 164016 : spin_unlock(&ei->i_block_reservation_lock);
1461 164060 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1462 164060 : return -ENOSPC;
1463 : }
1464 26619308 : ei->i_reserved_data_blocks++;
1465 26619308 : trace_ext4_da_reserve_space(inode);
1466 26554893 : spin_unlock(&ei->i_block_reservation_lock);
1467 :
1468 26554893 : return 0; /* success */
1469 : }
1470 :
1471 8338047 : void ext4_da_release_space(struct inode *inode, int to_free)
1472 : {
1473 8338047 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 8338047 : struct ext4_inode_info *ei = EXT4_I(inode);
1475 :
1476 8338047 : if (!to_free)
1477 : return; /* Nothing to release, exit */
1478 :
1479 1013438 : spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1480 :
1481 1013703 : trace_ext4_da_release_space(inode, to_free);
1482 1013452 : if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1483 : /*
1484 : * if there aren't enough reserved blocks, then the
1485 : * counter is messed up somewhere. Since this
1486 : * function is called from invalidate page, it's
1487 : * harmless to return without any action.
1488 : */
1489 0 : ext4_warning(inode->i_sb, "ext4_da_release_space: "
1490 : "ino %lu, to_free %d with only %d reserved "
1491 : "data blocks", inode->i_ino, to_free,
1492 : ei->i_reserved_data_blocks);
1493 0 : WARN_ON(1);
1494 0 : to_free = ei->i_reserved_data_blocks;
1495 : }
1496 1013452 : ei->i_reserved_data_blocks -= to_free;
1497 :
1498 : /* update fs dirty data blocks counter */
1499 1013452 : percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1500 :
1501 1013487 : spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1502 :
1503 1013686 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1504 : }
1505 :
1506 : /*
1507 : * Delayed allocation stuff
1508 : */
1509 :
1510 : struct mpage_da_data {
1511 : /* These are input fields for ext4_do_writepages() */
1512 : struct inode *inode;
1513 : struct writeback_control *wbc;
1514 : unsigned int can_map:1; /* Can writepages call map blocks? */
1515 :
1516 : /* These are internal state of ext4_do_writepages() */
1517 : pgoff_t first_page; /* The first page to write */
1518 : pgoff_t next_page; /* Current page to examine */
1519 : pgoff_t last_page; /* Last page to examine */
1520 : /*
1521 : * Extent to map - this can be after first_page because that can be
1522 : * fully mapped. We somewhat abuse m_flags to store whether the extent
1523 : * is delalloc or unwritten.
1524 : */
1525 : struct ext4_map_blocks map;
1526 : struct ext4_io_submit io_submit; /* IO submission data */
1527 : unsigned int do_map:1;
1528 : unsigned int scanned_until_end:1;
1529 : unsigned int journalled_more_data:1;
1530 : };
1531 :
1532 2689853 : static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1533 : bool invalidate)
1534 : {
1535 2689853 : unsigned nr, i;
1536 2689853 : pgoff_t index, end;
1537 2689853 : struct folio_batch fbatch;
1538 2689853 : struct inode *inode = mpd->inode;
1539 2689853 : struct address_space *mapping = inode->i_mapping;
1540 :
1541 : /* This is necessary when next_page == 0. */
1542 2689853 : if (mpd->first_page >= mpd->next_page)
1543 1743333 : return;
1544 :
1545 946520 : mpd->scanned_until_end = 0;
1546 946520 : index = mpd->first_page;
1547 946520 : end = mpd->next_page - 1;
1548 946520 : if (invalidate) {
1549 0 : ext4_lblk_t start, last;
1550 0 : start = index << (PAGE_SHIFT - inode->i_blkbits);
1551 0 : last = end << (PAGE_SHIFT - inode->i_blkbits);
1552 :
1553 : /*
1554 : * avoid racing with extent status tree scans made by
1555 : * ext4_insert_delayed_block()
1556 : */
1557 0 : down_write(&EXT4_I(inode)->i_data_sem);
1558 0 : ext4_es_remove_extent(inode, start, last - start + 1);
1559 0 : up_write(&EXT4_I(inode)->i_data_sem);
1560 : }
1561 :
1562 946513 : folio_batch_init(&fbatch);
1563 2170302 : while (index <= end) {
1564 1223781 : nr = filemap_get_folios(mapping, &index, end, &fbatch);
1565 1223766 : if (nr == 0)
1566 : break;
1567 6689903 : for (i = 0; i < nr; i++) {
1568 5466114 : struct folio *folio = fbatch.folios[i];
1569 :
1570 5466106 : if (folio->index < mpd->first_page)
1571 0 : continue;
1572 5466106 : if (folio->index + folio_nr_pages(folio) - 1 > end)
1573 0 : continue;
1574 5466106 : BUG_ON(!folio_test_locked(folio));
1575 5466106 : BUG_ON(folio_test_writeback(folio));
1576 5466106 : if (invalidate) {
1577 0 : if (folio_mapped(folio))
1578 0 : folio_clear_dirty_for_io(folio);
1579 0 : block_invalidate_folio(folio, 0,
1580 : folio_size(folio));
1581 0 : folio_clear_uptodate(folio);
1582 : }
1583 5466106 : folio_unlock(folio);
1584 : }
1585 1223789 : folio_batch_release(&fbatch);
1586 : }
1587 : }
1588 :
1589 0 : static void ext4_print_free_blocks(struct inode *inode)
1590 : {
1591 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1592 0 : struct super_block *sb = inode->i_sb;
1593 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1594 :
1595 0 : ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1596 : EXT4_C2B(EXT4_SB(inode->i_sb),
1597 : ext4_count_free_clusters(sb)));
1598 0 : ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1599 0 : ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1600 : (long long) EXT4_C2B(EXT4_SB(sb),
1601 : percpu_counter_sum(&sbi->s_freeclusters_counter)));
1602 0 : ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1603 : (long long) EXT4_C2B(EXT4_SB(sb),
1604 : percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1605 0 : ext4_msg(sb, KERN_CRIT, "Block reservation details");
1606 0 : ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1607 : ei->i_reserved_data_blocks);
1608 0 : return;
1609 : }
1610 :
1611 : /*
1612 : * ext4_insert_delayed_block - adds a delayed block to the extents status
1613 : * tree, incrementing the reserved cluster/block
1614 : * count or making a pending reservation
1615 : * where needed
1616 : *
1617 : * @inode - file containing the newly added block
1618 : * @lblk - logical block to be added
1619 : *
1620 : * Returns 0 on success, negative error code on failure.
1621 : */
1622 26840066 : static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
1623 : {
1624 26840066 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1625 26840066 : int ret;
1626 26840066 : bool allocated = false;
1627 :
1628 : /*
1629 : * If the cluster containing lblk is shared with a delayed,
1630 : * written, or unwritten extent in a bigalloc file system, it's
1631 : * already been accounted for and does not need to be reserved.
1632 : * A pending reservation must be made for the cluster if it's
1633 : * shared with a written or unwritten extent and doesn't already
1634 : * have one. Written and unwritten extents can be purged from the
1635 : * extents status tree if the system is under memory pressure, so
1636 : * it's necessary to examine the extent tree if a search of the
1637 : * extents status tree doesn't get a match.
1638 : */
1639 26840066 : if (sbi->s_cluster_ratio == 1) {
1640 26774530 : ret = ext4_da_reserve_space(inode);
1641 26797413 : if (ret != 0) /* ENOSPC */
1642 : return ret;
1643 : } else { /* bigalloc */
1644 65536 : if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
1645 4097 : if (!ext4_es_scan_clu(inode,
1646 : &ext4_es_is_mapped, lblk)) {
1647 8192 : ret = ext4_clu_mapped(inode,
1648 4096 : EXT4_B2C(sbi, lblk));
1649 4096 : if (ret < 0)
1650 : return ret;
1651 4096 : if (ret == 0) {
1652 4096 : ret = ext4_da_reserve_space(inode);
1653 4096 : if (ret != 0) /* ENOSPC */
1654 : return ret;
1655 : } else {
1656 : allocated = true;
1657 : }
1658 : } else {
1659 : allocated = true;
1660 : }
1661 : }
1662 : }
1663 :
1664 26696364 : ext4_es_insert_delayed_block(inode, lblk, allocated);
1665 26696364 : return 0;
1666 : }
1667 :
1668 : /*
1669 : * This function is grabs code from the very beginning of
1670 : * ext4_map_blocks, but assumes that the caller is from delayed write
1671 : * time. This function looks up the requested blocks and sets the
1672 : * buffer delay bit under the protection of i_data_sem.
1673 : */
1674 36861729 : static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1675 : struct ext4_map_blocks *map,
1676 : struct buffer_head *bh)
1677 : {
1678 36861729 : struct extent_status es;
1679 36861729 : int retval;
1680 36861729 : sector_t invalid_block = ~((sector_t) 0xffff);
1681 : #ifdef ES_AGGRESSIVE_TEST
1682 : struct ext4_map_blocks orig_map;
1683 :
1684 : memcpy(&orig_map, map, sizeof(*map));
1685 : #endif
1686 :
1687 73603590 : if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1688 0 : invalid_block = ~0;
1689 :
1690 36861729 : map->m_flags = 0;
1691 36861729 : ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
1692 : (unsigned long) map->m_lblk);
1693 :
1694 : /* Lookup extent status tree firstly */
1695 36861729 : if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
1696 34901659 : if (ext4_es_is_hole(&es)) {
1697 25084265 : retval = 0;
1698 25084265 : down_read(&EXT4_I(inode)->i_data_sem);
1699 25083003 : goto add_delayed;
1700 : }
1701 :
1702 : /*
1703 : * Delayed extent could be allocated by fallocate.
1704 : * So we need to check it.
1705 : */
1706 9817394 : if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
1707 0 : map_bh(bh, inode->i_sb, invalid_block);
1708 0 : set_buffer_new(bh);
1709 0 : set_buffer_delay(bh);
1710 0 : return 0;
1711 : }
1712 :
1713 9817394 : map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
1714 9817394 : retval = es.es_len - (iblock - es.es_lblk);
1715 9817394 : if (retval > map->m_len)
1716 : retval = map->m_len;
1717 9817394 : map->m_len = retval;
1718 9817394 : if (ext4_es_is_written(&es))
1719 5092504 : map->m_flags |= EXT4_MAP_MAPPED;
1720 4724890 : else if (ext4_es_is_unwritten(&es))
1721 4724890 : map->m_flags |= EXT4_MAP_UNWRITTEN;
1722 : else
1723 0 : BUG();
1724 :
1725 : #ifdef ES_AGGRESSIVE_TEST
1726 : ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
1727 : #endif
1728 9817394 : return retval;
1729 : }
1730 :
1731 : /*
1732 : * Try to see if we can get the block without requesting a new
1733 : * file system block.
1734 : */
1735 1992082 : down_read(&EXT4_I(inode)->i_data_sem);
1736 1990283 : if (ext4_has_inline_data(inode))
1737 : retval = 0;
1738 1990283 : else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1739 1865067 : retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1740 : else
1741 125216 : retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1742 :
1743 1984141 : add_delayed:
1744 27067144 : if (retval == 0) {
1745 26842601 : int ret;
1746 :
1747 : /*
1748 : * XXX: __block_prepare_write() unmaps passed block,
1749 : * is it OK?
1750 : */
1751 :
1752 26842601 : ret = ext4_insert_delayed_block(inode, map->m_lblk);
1753 26793907 : if (ret != 0) {
1754 166491 : retval = ret;
1755 166491 : goto out_unlock;
1756 : }
1757 :
1758 26627416 : map_bh(bh, inode->i_sb, invalid_block);
1759 26693129 : set_buffer_new(bh);
1760 26699937 : set_buffer_delay(bh);
1761 224543 : } else if (retval > 0) {
1762 224542 : unsigned int status;
1763 :
1764 224542 : if (unlikely(retval != map->m_len)) {
1765 0 : ext4_warning(inode->i_sb,
1766 : "ES len assertion failed for inode "
1767 : "%lu: retval %d != map->m_len %d",
1768 : inode->i_ino, retval, map->m_len);
1769 0 : WARN_ON(1);
1770 : }
1771 :
1772 224542 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
1773 224542 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
1774 224542 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
1775 : map->m_pblk, status);
1776 : }
1777 :
1778 1 : out_unlock:
1779 27074814 : up_read((&EXT4_I(inode)->i_data_sem));
1780 :
1781 27074814 : return retval;
1782 : }
1783 :
1784 : /*
1785 : * This is a special get_block_t callback which is used by
1786 : * ext4_da_write_begin(). It will either return mapped block or
1787 : * reserve space for a single block.
1788 : *
1789 : * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1790 : * We also have b_blocknr = -1 and b_bdev initialized properly
1791 : *
1792 : * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1793 : * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1794 : * initialized properly.
1795 : */
1796 36882017 : int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1797 : struct buffer_head *bh, int create)
1798 : {
1799 36882017 : struct ext4_map_blocks map;
1800 36882017 : int ret = 0;
1801 :
1802 36882017 : BUG_ON(create == 0);
1803 36882017 : BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1804 :
1805 36882017 : map.m_lblk = iblock;
1806 36882017 : map.m_len = 1;
1807 :
1808 : /*
1809 : * first, we need to know whether the block is allocated already
1810 : * preallocated blocks are unmapped but should treated
1811 : * the same as allocated blocks.
1812 : */
1813 36882017 : ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1814 36870013 : if (ret <= 0)
1815 : return ret;
1816 :
1817 10041848 : map_bh(bh, inode->i_sb, map.m_pblk);
1818 10045091 : ext4_update_bh_state(bh, map.m_flags);
1819 :
1820 20092110 : if (buffer_unwritten(bh)) {
1821 : /* A delayed write to unwritten bh should be marked
1822 : * new and mapped. Mapped ensures that we don't do
1823 : * get_block multiple times when we write to the same
1824 : * offset and new ensures that we do proper zero out
1825 : * for partial write.
1826 : */
1827 4819470 : set_buffer_new(bh);
1828 4819351 : set_buffer_mapped(bh);
1829 : }
1830 : return 0;
1831 : }
1832 :
1833 30647221 : static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
1834 : {
1835 30647221 : mpd->first_page += folio_nr_pages(folio);
1836 30647221 : folio_unlock(folio);
1837 30654291 : }
1838 :
1839 30650846 : static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
1840 : {
1841 30650846 : size_t len;
1842 30650846 : loff_t size;
1843 30650846 : int err;
1844 :
1845 30650846 : BUG_ON(folio->index != mpd->first_page);
1846 30650846 : folio_clear_dirty_for_io(folio);
1847 : /*
1848 : * We have to be very careful here! Nothing protects writeback path
1849 : * against i_size changes and the page can be writeably mapped into
1850 : * page tables. So an application can be growing i_size and writing
1851 : * data through mmap while writeback runs. folio_clear_dirty_for_io()
1852 : * write-protects our page in page tables and the page cannot get
1853 : * written to again until we release folio lock. So only after
1854 : * folio_clear_dirty_for_io() we are safe to sample i_size for
1855 : * ext4_bio_write_folio() to zero-out tail of the written page. We rely
1856 : * on the barrier provided by folio_test_clear_dirty() in
1857 : * folio_clear_dirty_for_io() to make sure i_size is really sampled only
1858 : * after page tables are updated.
1859 : */
1860 30646235 : size = i_size_read(mpd->inode);
1861 30646235 : len = folio_size(folio);
1862 30646483 : if (folio_pos(folio) + len > size &&
1863 : !ext4_verity_in_progress(mpd->inode))
1864 386003 : len = size & ~PAGE_MASK;
1865 30646483 : err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
1866 30647406 : if (!err)
1867 30647421 : mpd->wbc->nr_to_write--;
1868 :
1869 30647406 : return err;
1870 : }
1871 :
1872 : #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
1873 :
1874 : /*
1875 : * mballoc gives us at most this number of blocks...
1876 : * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
1877 : * The rest of mballoc seems to handle chunks up to full group size.
1878 : */
1879 : #define MAX_WRITEPAGES_EXTENT_LEN 2048
1880 :
1881 : /*
1882 : * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1883 : *
1884 : * @mpd - extent of blocks
1885 : * @lblk - logical number of the block in the file
1886 : * @bh - buffer head we want to add to the extent
1887 : *
1888 : * The function is used to collect contig. blocks in the same state. If the
1889 : * buffer doesn't require mapping for writeback and we haven't started the
1890 : * extent of buffers to map yet, the function returns 'true' immediately - the
1891 : * caller can write the buffer right away. Otherwise the function returns true
1892 : * if the block has been added to the extent, false if the block couldn't be
1893 : * added.
1894 : */
1895 35757698 : static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1896 : struct buffer_head *bh)
1897 : {
1898 35757698 : struct ext4_map_blocks *map = &mpd->map;
1899 :
1900 : /* Buffer that doesn't need mapping for writeback? */
1901 143031389 : if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1902 11057680 : (!buffer_delay(bh) && !buffer_unwritten(bh))) {
1903 : /* So far no extent to map => we write the buffer right away */
1904 5757486 : if (map->m_len == 0)
1905 : return true;
1906 79196 : return false;
1907 : }
1908 :
1909 : /* First block in the extent? */
1910 30000212 : if (map->m_len == 0) {
1911 : /* We cannot map unless handle is started... */
1912 1841586 : if (!mpd->do_map)
1913 : return false;
1914 1117598 : map->m_lblk = lblk;
1915 1117598 : map->m_len = 1;
1916 1117598 : map->m_flags = bh->b_state & BH_FLAGS;
1917 1117598 : return true;
1918 : }
1919 :
1920 : /* Don't go larger than mballoc is willing to allocate */
1921 28158626 : if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1922 : return false;
1923 :
1924 : /* Can we merge the block to our big extent? */
1925 28152021 : if (lblk == map->m_lblk + map->m_len &&
1926 28152602 : (bh->b_state & BH_FLAGS) == map->m_flags) {
1927 28070509 : map->m_len++;
1928 28070509 : return true;
1929 : }
1930 : return false;
1931 : }
1932 :
1933 : /*
1934 : * mpage_process_page_bufs - submit page buffers for IO or add them to extent
1935 : *
1936 : * @mpd - extent of blocks for mapping
1937 : * @head - the first buffer in the page
1938 : * @bh - buffer we should start processing from
1939 : * @lblk - logical number of the block in the file corresponding to @bh
1940 : *
1941 : * Walk through page buffers from @bh upto @head (exclusive) and either submit
1942 : * the page for IO if all buffers in this page were mapped and there's no
1943 : * accumulated extent of buffers to map or add buffers in the page to the
1944 : * extent of buffers to map. The function returns 1 if the caller can continue
1945 : * by processing the next page, 0 if it should stop adding buffers to the
1946 : * extent to map because we cannot extend it anymore. It can also return value
1947 : * < 0 in case of error during IO submission.
1948 : */
1949 35766469 : static int mpage_process_page_bufs(struct mpage_da_data *mpd,
1950 : struct buffer_head *head,
1951 : struct buffer_head *bh,
1952 : ext4_lblk_t lblk)
1953 : {
1954 35766469 : struct inode *inode = mpd->inode;
1955 35766469 : int err;
1956 35765818 : ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
1957 35766469 : >> inode->i_blkbits;
1958 :
1959 35765818 : if (ext4_verity_in_progress(inode))
1960 : blocks = EXT_MAX_BLOCKS;
1961 :
1962 35722699 : do {
1963 71445398 : BUG_ON(buffer_locked(bh));
1964 :
1965 35722699 : if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
1966 : /* Found extent to map? */
1967 892146 : if (mpd->map.m_len)
1968 : return 0;
1969 : /* Buffer needs mapping and handle is not started? */
1970 724124 : if (!mpd->do_map)
1971 : return 0;
1972 : /* Everything mapped so far and we hit EOF */
1973 : break;
1974 : }
1975 34823394 : } while (lblk++, (bh = bh->b_this_page) != head);
1976 : /* So far everything mapped? Submit the page for IO. */
1977 34866596 : if (mpd->map.m_len == 0) {
1978 5678368 : err = mpage_submit_folio(mpd, head->b_folio);
1979 5678333 : if (err < 0)
1980 : return err;
1981 5678333 : mpage_folio_done(mpd, head->b_folio);
1982 : }
1983 34866654 : if (lblk >= blocks) {
1984 495075 : mpd->scanned_until_end = 1;
1985 495075 : return 0;
1986 : }
1987 : return 1;
1988 : }
1989 :
1990 : /*
1991 : * mpage_process_folio - update folio buffers corresponding to changed extent
1992 : * and may submit fully mapped page for IO
1993 : * @mpd: description of extent to map, on return next extent to map
1994 : * @folio: Contains these buffers.
1995 : * @m_lblk: logical block mapping.
1996 : * @m_pblk: corresponding physical mapping.
1997 : * @map_bh: determines on return whether this page requires any further
1998 : * mapping or not.
1999 : *
2000 : * Scan given folio buffers corresponding to changed extent and update buffer
2001 : * state according to new extent state.
2002 : * We map delalloc buffers to their physical location, clear unwritten bits.
2003 : * If the given folio is not fully mapped, we update @mpd to the next extent in
2004 : * the given folio that needs mapping & return @map_bh as true.
2005 : */
2006 24620232 : static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
2007 : ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
2008 : bool *map_bh)
2009 : {
2010 24620232 : struct buffer_head *head, *bh;
2011 24620232 : ext4_io_end_t *io_end = mpd->io_submit.io_end;
2012 24620232 : ext4_lblk_t lblk = *m_lblk;
2013 24620232 : ext4_fsblk_t pblock = *m_pblk;
2014 24620232 : int err = 0;
2015 24620232 : int blkbits = mpd->inode->i_blkbits;
2016 24620232 : ssize_t io_end_size = 0;
2017 24620232 : struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
2018 :
2019 24619751 : bh = head = folio_buffers(folio);
2020 24617598 : do {
2021 24617598 : if (lblk < mpd->map.m_lblk)
2022 0 : continue;
2023 24617598 : if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2024 : /*
2025 : * Buffer after end of mapped extent.
2026 : * Find next buffer in the folio to map.
2027 : */
2028 1 : mpd->map.m_len = 0;
2029 1 : mpd->map.m_flags = 0;
2030 1 : io_end_vec->size += io_end_size;
2031 :
2032 1 : err = mpage_process_page_bufs(mpd, head, bh, lblk);
2033 1 : if (err > 0)
2034 : err = 0;
2035 1 : if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
2036 0 : io_end_vec = ext4_alloc_io_end_vec(io_end);
2037 0 : if (IS_ERR(io_end_vec)) {
2038 0 : err = PTR_ERR(io_end_vec);
2039 0 : goto out;
2040 : }
2041 0 : io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
2042 : }
2043 1 : *map_bh = true;
2044 1 : goto out;
2045 : }
2046 49235194 : if (buffer_delay(bh)) {
2047 20231246 : clear_buffer_delay(bh);
2048 20236038 : bh->b_blocknr = pblock++;
2049 : }
2050 24622389 : clear_buffer_unwritten(bh);
2051 24618486 : io_end_size += (1 << blkbits);
2052 24618486 : } while (lblk++, (bh = bh->b_this_page) != head);
2053 :
2054 24620639 : io_end_vec->size += io_end_size;
2055 24620639 : *map_bh = false;
2056 24620640 : out:
2057 24620640 : *m_lblk = lblk;
2058 24620640 : *m_pblk = pblock;
2059 24620640 : return err;
2060 : }
2061 :
2062 : /*
2063 : * mpage_map_buffers - update buffers corresponding to changed extent and
2064 : * submit fully mapped pages for IO
2065 : *
2066 : * @mpd - description of extent to map, on return next extent to map
2067 : *
2068 : * Scan buffers corresponding to changed extent (we expect corresponding pages
2069 : * to be already locked) and update buffer state according to new extent state.
2070 : * We map delalloc buffers to their physical location, clear unwritten bits,
2071 : * and mark buffers as uninit when we perform writes to unwritten extents
2072 : * and do extent conversion after IO is finished. If the last page is not fully
2073 : * mapped, we update @map to the next extent in the last page that needs
2074 : * mapping. Otherwise we submit the page for IO.
2075 : */
2076 1117545 : static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2077 : {
2078 1117545 : struct folio_batch fbatch;
2079 1117545 : unsigned nr, i;
2080 1117545 : struct inode *inode = mpd->inode;
2081 1117545 : int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
2082 1117545 : pgoff_t start, end;
2083 1117545 : ext4_lblk_t lblk;
2084 1117545 : ext4_fsblk_t pblock;
2085 1117545 : int err;
2086 1117545 : bool map_bh = false;
2087 :
2088 1117545 : start = mpd->map.m_lblk >> bpp_bits;
2089 1117545 : end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2090 1117545 : lblk = start << bpp_bits;
2091 1117545 : pblock = mpd->map.m_pblk;
2092 :
2093 1117545 : folio_batch_init(&fbatch);
2094 3584535 : while (start <= end) {
2095 2466988 : nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
2096 2466992 : if (nr == 0)
2097 : break;
2098 27088563 : for (i = 0; i < nr; i++) {
2099 24621557 : struct folio *folio = fbatch.folios[i];
2100 :
2101 24620824 : err = mpage_process_folio(mpd, folio, &lblk, &pblock,
2102 : &map_bh);
2103 : /*
2104 : * If map_bh is true, means page may require further bh
2105 : * mapping, or maybe the page was submitted for IO.
2106 : * So we return to call further extent mapping.
2107 : */
2108 24619564 : if (err < 0 || map_bh)
2109 1 : goto out;
2110 : /* Page fully mapped - let IO run! */
2111 24619563 : err = mpage_submit_folio(mpd, folio);
2112 24615759 : if (err < 0)
2113 0 : goto out;
2114 24615759 : mpage_folio_done(mpd, folio);
2115 : }
2116 2467006 : folio_batch_release(&fbatch);
2117 : }
2118 : /* Extent fully mapped and matches with page boundary. We are done. */
2119 1117545 : mpd->map.m_len = 0;
2120 1117545 : mpd->map.m_flags = 0;
2121 1117545 : return 0;
2122 1 : out:
2123 1 : folio_batch_release(&fbatch);
2124 : return err;
2125 : }
2126 :
2127 1117624 : static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2128 : {
2129 1117624 : struct inode *inode = mpd->inode;
2130 1117624 : struct ext4_map_blocks *map = &mpd->map;
2131 1117624 : int get_blocks_flags;
2132 1117624 : int err, dioread_nolock;
2133 :
2134 1117624 : trace_ext4_da_write_pages_extent(inode, map);
2135 : /*
2136 : * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2137 : * to convert an unwritten extent to be initialized (in the case
2138 : * where we have written into one or more preallocated blocks). It is
2139 : * possible that we're going to need more metadata blocks than
2140 : * previously reserved. However we must not fail because we're in
2141 : * writeback and there is nothing we can do about it so it might result
2142 : * in data loss. So use reserved blocks to allocate metadata if
2143 : * possible.
2144 : *
2145 : * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
2146 : * the blocks in question are delalloc blocks. This indicates
2147 : * that the blocks and quotas has already been checked when
2148 : * the data was copied into the page cache.
2149 : */
2150 1117613 : get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2151 : EXT4_GET_BLOCKS_METADATA_NOFAIL |
2152 : EXT4_GET_BLOCKS_IO_SUBMIT;
2153 1117613 : dioread_nolock = ext4_should_dioread_nolock(inode);
2154 1117612 : if (dioread_nolock)
2155 1117414 : get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2156 1117612 : if (map->m_flags & BIT(BH_Delay))
2157 885417 : get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2158 :
2159 1117612 : err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2160 1117615 : if (err < 0)
2161 : return err;
2162 1117535 : if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
2163 1117322 : if (!mpd->io_submit.io_end->handle &&
2164 : ext4_handle_valid(handle)) {
2165 1117303 : mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2166 1117303 : handle->h_rsv_handle = NULL;
2167 : }
2168 1117322 : ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2169 : }
2170 :
2171 1117558 : BUG_ON(map->m_len == 0);
2172 : return 0;
2173 : }
2174 :
2175 : /*
2176 : * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2177 : * mpd->len and submit pages underlying it for IO
2178 : *
2179 : * @handle - handle for journal operations
2180 : * @mpd - extent to map
2181 : * @give_up_on_write - we set this to true iff there is a fatal error and there
2182 : * is no hope of writing the data. The caller should discard
2183 : * dirty pages to avoid infinite loops.
2184 : *
2185 : * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2186 : * delayed, blocks are allocated, if it is unwritten, we may need to convert
2187 : * them to initialized or split the described range from larger unwritten
2188 : * extent. Note that we need not map all the described range since allocation
2189 : * can return less blocks or the range is covered by more unwritten extents. We
2190 : * cannot map more because we are limited by reserved transaction credits. On
2191 : * the other hand we always make sure that the last touched page is fully
2192 : * mapped so that it can be written out (and thus forward progress is
2193 : * guaranteed). After mapping we submit all mapped pages for IO.
2194 : */
2195 1117633 : static int mpage_map_and_submit_extent(handle_t *handle,
2196 : struct mpage_da_data *mpd,
2197 : bool *give_up_on_write)
2198 : {
2199 1117633 : struct inode *inode = mpd->inode;
2200 1117633 : struct ext4_map_blocks *map = &mpd->map;
2201 1117633 : int err;
2202 1117633 : loff_t disksize;
2203 1117633 : int progress = 0;
2204 1117633 : ext4_io_end_t *io_end = mpd->io_submit.io_end;
2205 1117633 : struct ext4_io_end_vec *io_end_vec;
2206 :
2207 1117633 : io_end_vec = ext4_alloc_io_end_vec(io_end);
2208 1117623 : if (IS_ERR(io_end_vec))
2209 0 : return PTR_ERR(io_end_vec);
2210 1117623 : io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
2211 1117625 : do {
2212 1117625 : err = mpage_map_one_extent(handle, mpd);
2213 1117630 : if (err < 0) {
2214 80 : struct super_block *sb = inode->i_sb;
2215 :
2216 160 : if (ext4_forced_shutdown(EXT4_SB(sb)) ||
2217 : ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
2218 0 : goto invalidate_dirty_pages;
2219 : /*
2220 : * Let the uper layers retry transient errors.
2221 : * In the case of ENOSPC, if ext4_count_free_blocks()
2222 : * is non-zero, a commit should free up blocks.
2223 : */
2224 80 : if ((err == -ENOMEM) ||
2225 80 : (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2226 80 : if (progress)
2227 0 : goto update_disksize;
2228 : return err;
2229 : }
2230 0 : ext4_msg(sb, KERN_CRIT,
2231 : "Delayed block allocation failed for "
2232 : "inode %lu at logical offset %llu with"
2233 : " max blocks %u with error %d",
2234 : inode->i_ino,
2235 : (unsigned long long)map->m_lblk,
2236 : (unsigned)map->m_len, -err);
2237 0 : ext4_msg(sb, KERN_CRIT,
2238 : "This should not happen!! Data will "
2239 : "be lost\n");
2240 0 : if (err == -ENOSPC)
2241 0 : ext4_print_free_blocks(inode);
2242 0 : invalidate_dirty_pages:
2243 0 : *give_up_on_write = true;
2244 0 : return err;
2245 : }
2246 1117550 : progress = 1;
2247 : /*
2248 : * Update buffer state, submit mapped pages, and get us new
2249 : * extent to map
2250 : */
2251 1117550 : err = mpage_map_and_submit_buffers(mpd);
2252 1117551 : if (err < 0)
2253 0 : goto update_disksize;
2254 1117551 : } while (map->m_len);
2255 :
2256 1117549 : update_disksize:
2257 : /*
2258 : * Update on-disk size after IO is submitted. Races with
2259 : * truncate are avoided by checking i_size under i_data_sem.
2260 : */
2261 1117549 : disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
2262 1117549 : if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
2263 496747 : int err2;
2264 496747 : loff_t i_size;
2265 :
2266 496747 : down_write(&EXT4_I(inode)->i_data_sem);
2267 496746 : i_size = i_size_read(inode);
2268 496746 : if (disksize > i_size)
2269 : disksize = i_size;
2270 496746 : if (disksize > EXT4_I(inode)->i_disksize)
2271 439187 : EXT4_I(inode)->i_disksize = disksize;
2272 496746 : up_write(&EXT4_I(inode)->i_data_sem);
2273 496747 : err2 = ext4_mark_inode_dirty(handle, inode);
2274 496746 : if (err2) {
2275 0 : ext4_error_err(inode->i_sb, -err2,
2276 : "Failed to mark inode %lu dirty",
2277 : inode->i_ino);
2278 : }
2279 496746 : if (!err)
2280 496746 : err = err2;
2281 : }
2282 : return err;
2283 : }
2284 :
2285 : /*
2286 : * Calculate the total number of credits to reserve for one writepages
2287 : * iteration. This is called from ext4_writepages(). We map an extent of
2288 : * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2289 : * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2290 : * bpp - 1 blocks in bpp different extents.
2291 : */
2292 1158816 : static int ext4_da_writepages_trans_blocks(struct inode *inode)
2293 : {
2294 1158816 : int bpp = ext4_journal_blocks_per_page(inode);
2295 :
2296 1158808 : return ext4_meta_trans_blocks(inode,
2297 : MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2298 : }
2299 :
2300 0 : static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
2301 : size_t len)
2302 : {
2303 0 : struct buffer_head *page_bufs = folio_buffers(folio);
2304 0 : struct inode *inode = folio->mapping->host;
2305 0 : int ret, err;
2306 :
2307 0 : ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
2308 : NULL, do_journal_get_write_access);
2309 0 : err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
2310 : NULL, write_end_fn);
2311 0 : if (ret == 0)
2312 0 : ret = err;
2313 0 : err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
2314 0 : if (ret == 0)
2315 0 : ret = err;
2316 0 : EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
2317 :
2318 0 : return ret;
2319 : }
2320 :
2321 0 : static int mpage_journal_page_buffers(handle_t *handle,
2322 : struct mpage_da_data *mpd,
2323 : struct folio *folio)
2324 : {
2325 0 : struct inode *inode = mpd->inode;
2326 0 : loff_t size = i_size_read(inode);
2327 0 : size_t len = folio_size(folio);
2328 :
2329 0 : folio_clear_checked(folio);
2330 0 : mpd->wbc->nr_to_write--;
2331 :
2332 0 : if (folio_pos(folio) + len > size &&
2333 : !ext4_verity_in_progress(inode))
2334 0 : len = size - folio_pos(folio);
2335 :
2336 0 : return ext4_journal_folio_buffers(handle, folio, len);
2337 : }
2338 :
2339 : /*
2340 : * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2341 : * needing mapping, submit mapped pages
2342 : *
2343 : * @mpd - where to look for pages
2344 : *
2345 : * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2346 : * IO immediately. If we cannot map blocks, we submit just already mapped
2347 : * buffers in the page for IO and keep page dirty. When we can map blocks and
2348 : * we find a page which isn't mapped we start accumulating extent of buffers
2349 : * underlying these pages that needs mapping (formed by either delayed or
2350 : * unwritten buffers). We also lock the pages containing these buffers. The
2351 : * extent found is returned in @mpd structure (starting at mpd->lblk with
2352 : * length mpd->len blocks).
2353 : *
2354 : * Note that this function can attach bios to one io_end structure which are
2355 : * neither logically nor physically contiguous. Although it may seem as an
2356 : * unnecessary complication, it is actually inevitable in blocksize < pagesize
2357 : * case as we need to track IO to all buffers underlying a page in one io_end.
2358 : */
2359 2690225 : static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2360 : {
2361 2690225 : struct address_space *mapping = mpd->inode->i_mapping;
2362 2690225 : struct folio_batch fbatch;
2363 2690225 : unsigned int nr_folios;
2364 2690225 : pgoff_t index = mpd->first_page;
2365 2690225 : pgoff_t end = mpd->last_page;
2366 2690225 : xa_mark_t tag;
2367 2690225 : int i, err = 0;
2368 2690225 : int blkbits = mpd->inode->i_blkbits;
2369 2690225 : ext4_lblk_t lblk;
2370 2690225 : struct buffer_head *head;
2371 2690225 : handle_t *handle = NULL;
2372 2690225 : int bpp = ext4_journal_blocks_per_page(mpd->inode);
2373 :
2374 2690119 : if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2375 : tag = PAGECACHE_TAG_TOWRITE;
2376 : else
2377 716369 : tag = PAGECACHE_TAG_DIRTY;
2378 :
2379 2690119 : mpd->map.m_len = 0;
2380 2690119 : mpd->next_page = index;
2381 2690119 : if (ext4_should_journal_data(mpd->inode)) {
2382 0 : handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
2383 : bpp);
2384 0 : if (IS_ERR(handle))
2385 0 : return PTR_ERR(handle);
2386 : }
2387 2690178 : folio_batch_init(&fbatch);
2388 5140444 : while (index <= end) {
2389 4714914 : nr_folios = filemap_get_folios_tag(mapping, &index, end,
2390 : tag, &fbatch);
2391 4714810 : if (nr_folios == 0)
2392 : break;
2393 :
2394 38959284 : for (i = 0; i < nr_folios; i++) {
2395 36509018 : struct folio *folio = fbatch.folios[i];
2396 :
2397 : /*
2398 : * Accumulated enough dirty pages? This doesn't apply
2399 : * to WB_SYNC_ALL mode. For integrity sync we have to
2400 : * keep going because someone may be concurrently
2401 : * dirtying pages, and we might have synced a lot of
2402 : * newly appeared dirty pages, but have not synced all
2403 : * of the old dirty pages.
2404 : */
2405 36509813 : if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2406 24370658 : mpd->wbc->nr_to_write <=
2407 24370658 : mpd->map.m_len >> (PAGE_SHIFT - blkbits))
2408 230 : goto out;
2409 :
2410 : /* If we can't merge this page, we are done. */
2411 36509583 : if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
2412 219398 : goto out;
2413 :
2414 36290185 : if (handle) {
2415 0 : err = ext4_journal_ensure_credits(handle, bpp,
2416 : 0);
2417 0 : if (err < 0)
2418 0 : goto out;
2419 : }
2420 :
2421 36290185 : folio_lock(folio);
2422 : /*
2423 : * If the page is no longer dirty, or its mapping no
2424 : * longer corresponds to inode we are writing (which
2425 : * means it has been truncated or invalidated), or the
2426 : * page is already under writeback and we are not doing
2427 : * a data integrity writeback, skip the page
2428 : */
2429 36297116 : if (!folio_test_dirty(folio) ||
2430 21381 : (folio_test_writeback(folio) &&
2431 21381 : (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2432 36119968 : unlikely(folio->mapping != mapping)) {
2433 177148 : folio_unlock(folio);
2434 177136 : continue;
2435 : }
2436 :
2437 36119968 : folio_wait_writeback(folio);
2438 36119984 : BUG_ON(folio_test_writeback(folio));
2439 :
2440 : /*
2441 : * Should never happen but for buggy code in
2442 : * other subsystems that call
2443 : * set_page_dirty() without properly warning
2444 : * the file system first. See [1] for more
2445 : * information.
2446 : *
2447 : * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
2448 : */
2449 36119984 : if (!folio_buffers(folio)) {
2450 0 : ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
2451 0 : folio_clear_dirty(folio);
2452 0 : folio_unlock(folio);
2453 0 : continue;
2454 : }
2455 :
2456 36119984 : if (mpd->map.m_len == 0)
2457 7873399 : mpd->first_page = folio->index;
2458 36119984 : mpd->next_page = folio->index + folio_nr_pages(folio);
2459 : /*
2460 : * Writeout when we cannot modify metadata is simple.
2461 : * Just submit the page. For data=journal mode we
2462 : * first handle writeout of the page for checkpoint and
2463 : * only after that handle delayed page dirtying. This
2464 : * makes sure current data is checkpointed to the final
2465 : * location before possibly journalling it again which
2466 : * is desirable when the page is frequently dirtied
2467 : * through a pin.
2468 : */
2469 36119984 : if (!mpd->can_map) {
2470 353305 : err = mpage_submit_folio(mpd, folio);
2471 353305 : if (err < 0)
2472 0 : goto out;
2473 : /* Pending dirtying of journalled data? */
2474 353305 : if (folio_test_checked(folio)) {
2475 0 : err = mpage_journal_page_buffers(handle,
2476 : mpd, folio);
2477 0 : if (err < 0)
2478 0 : goto out;
2479 0 : mpd->journalled_more_data = 1;
2480 : }
2481 353305 : mpage_folio_done(mpd, folio);
2482 : } else {
2483 : /* Add all dirty buffers to mpd */
2484 0 : lblk = ((ext4_lblk_t)folio->index) <<
2485 35766679 : (PAGE_SHIFT - blkbits);
2486 35766679 : head = folio_buffers(folio);
2487 35766679 : err = mpage_process_page_bufs(mpd, head, head,
2488 : lblk);
2489 35759011 : if (err <= 0)
2490 1387134 : goto out;
2491 : err = 0;
2492 : }
2493 : }
2494 2450266 : folio_batch_release(&fbatch);
2495 2450272 : cond_resched();
2496 : }
2497 1083374 : mpd->scanned_until_end = 1;
2498 1083374 : if (handle)
2499 0 : ext4_journal_stop(handle);
2500 : return 0;
2501 1606762 : out:
2502 1606762 : folio_batch_release(&fbatch);
2503 1606783 : if (handle)
2504 0 : ext4_journal_stop(handle);
2505 : return err;
2506 : }
2507 :
2508 1816475 : static int ext4_do_writepages(struct mpage_da_data *mpd)
2509 : {
2510 1816475 : struct writeback_control *wbc = mpd->wbc;
2511 1816475 : pgoff_t writeback_index = 0;
2512 1816475 : long nr_to_write = wbc->nr_to_write;
2513 1816475 : int range_whole = 0;
2514 1816475 : int cycled = 1;
2515 1816475 : handle_t *handle = NULL;
2516 1816475 : struct inode *inode = mpd->inode;
2517 1816475 : struct address_space *mapping = inode->i_mapping;
2518 1816475 : int needed_blocks, rsv_blocks = 0, ret = 0;
2519 1816475 : struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2520 1816475 : struct blk_plug plug;
2521 1816475 : bool give_up_on_write = false;
2522 :
2523 1816475 : trace_ext4_writepages(inode, wbc);
2524 :
2525 : /*
2526 : * No pages to write? This is mainly a kludge to avoid starting
2527 : * a transaction for special inodes like journal inode on last iput()
2528 : * because that could violate lock ordering on umount
2529 : */
2530 3528137 : if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2531 357551 : goto out_writepages;
2532 :
2533 : /*
2534 : * If the filesystem has aborted, it is read-only, so return
2535 : * right away instead of dumping stack traces later on that
2536 : * will obscure the real source of the problem. We test
2537 : * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
2538 : * the latter could be true if the filesystem is mounted
2539 : * read-only, and in that case, ext4_writepages should
2540 : * *never* be called, so if that ever happens, we would want
2541 : * the stack trace.
2542 : */
2543 2917570 : if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
2544 : ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
2545 0 : ret = -EROFS;
2546 0 : goto out_writepages;
2547 : }
2548 :
2549 : /*
2550 : * If we have inline data and arrive here, it means that
2551 : * we will soon create the block for the 1st page, so
2552 : * we'd better clear the inline data here.
2553 : */
2554 1458785 : if (ext4_has_inline_data(inode)) {
2555 : /* Just inode will be modified... */
2556 0 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2557 0 : if (IS_ERR(handle)) {
2558 0 : ret = PTR_ERR(handle);
2559 0 : goto out_writepages;
2560 : }
2561 0 : BUG_ON(ext4_test_inode_state(inode,
2562 : EXT4_STATE_MAY_INLINE_DATA));
2563 0 : ext4_destroy_inline_data(handle, inode);
2564 0 : ext4_journal_stop(handle);
2565 : }
2566 :
2567 : /*
2568 : * data=journal mode does not do delalloc so we just need to writeout /
2569 : * journal already mapped buffers. On the other hand we need to commit
2570 : * transaction to make data stable. We expect all the data to be
2571 : * already in the journal (the only exception are DMA pinned pages
2572 : * dirtied behind our back) so we commit transaction here and run the
2573 : * writeback loop to checkpoint them. The checkpointing is not actually
2574 : * necessary to make data persistent *but* quite a few places (extent
2575 : * shifting operations, fsverity, ...) depend on being able to drop
2576 : * pagecache pages after calling filemap_write_and_wait() and for that
2577 : * checkpointing needs to happen.
2578 : */
2579 1458785 : if (ext4_should_journal_data(inode)) {
2580 0 : mpd->can_map = 0;
2581 0 : if (wbc->sync_mode == WB_SYNC_ALL)
2582 0 : ext4_fc_commit(sbi->s_journal,
2583 0 : EXT4_I(inode)->i_datasync_tid);
2584 : }
2585 1458787 : mpd->journalled_more_data = 0;
2586 :
2587 1458787 : if (ext4_should_dioread_nolock(inode)) {
2588 : /*
2589 : * We may need to convert up to one extent per block in
2590 : * the page and we may dirty the inode.
2591 : */
2592 1458550 : rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
2593 1458497 : PAGE_SIZE >> inode->i_blkbits);
2594 : }
2595 :
2596 1458772 : if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2597 443483 : range_whole = 1;
2598 :
2599 1458772 : if (wbc->range_cyclic) {
2600 234955 : writeback_index = mapping->writeback_index;
2601 234955 : if (writeback_index)
2602 72861 : cycled = 0;
2603 234955 : mpd->first_page = writeback_index;
2604 234955 : mpd->last_page = -1;
2605 : } else {
2606 1223817 : mpd->first_page = wbc->range_start >> PAGE_SHIFT;
2607 1223817 : mpd->last_page = wbc->range_end >> PAGE_SHIFT;
2608 : }
2609 :
2610 1458772 : ext4_io_submit_init(&mpd->io_submit, wbc);
2611 1531279 : retry:
2612 1531279 : if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2613 1167688 : tag_pages_for_writeback(mapping, mpd->first_page,
2614 : mpd->last_page);
2615 1531628 : blk_start_plug(&plug);
2616 :
2617 : /*
2618 : * First writeback pages that don't need mapping - we can avoid
2619 : * starting a transaction unnecessarily and also avoid being blocked
2620 : * in the block layer on device congestion while having transaction
2621 : * started.
2622 : */
2623 1531619 : mpd->do_map = 0;
2624 1531619 : mpd->scanned_until_end = 0;
2625 1531619 : mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2626 1531535 : if (!mpd->io_submit.io_end) {
2627 0 : ret = -ENOMEM;
2628 0 : goto unplug;
2629 : }
2630 1531535 : ret = mpage_prepare_extent_to_map(mpd);
2631 : /* Unlock pages we didn't use */
2632 1530915 : mpage_release_unused_pages(mpd, false);
2633 : /* Submit prepared bio */
2634 1531141 : ext4_io_submit(&mpd->io_submit);
2635 1531105 : ext4_put_io_end_defer(mpd->io_submit.io_end);
2636 1531276 : mpd->io_submit.io_end = NULL;
2637 1531276 : if (ret < 0)
2638 0 : goto unplug;
2639 :
2640 2690106 : while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
2641 : /* For each extent of pages we use new io_end */
2642 1158812 : mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2643 1158809 : if (!mpd->io_submit.io_end) {
2644 : ret = -ENOMEM;
2645 : break;
2646 : }
2647 :
2648 1158809 : WARN_ON_ONCE(!mpd->can_map);
2649 : /*
2650 : * We have two constraints: We find one extent to map and we
2651 : * must always write out whole page (makes a difference when
2652 : * blocksize < pagesize) so that we don't block on IO when we
2653 : * try to write out the rest of the page. Journalled mode is
2654 : * not supported by delalloc.
2655 : */
2656 1158809 : BUG_ON(ext4_should_journal_data(inode));
2657 1158804 : needed_blocks = ext4_da_writepages_trans_blocks(inode);
2658 :
2659 : /* start a new transaction */
2660 1158809 : handle = ext4_journal_start_with_reserve(inode,
2661 : EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2662 1158791 : if (IS_ERR(handle)) {
2663 0 : ret = PTR_ERR(handle);
2664 0 : ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2665 : "%ld pages, ino %lu; err %d", __func__,
2666 : wbc->nr_to_write, inode->i_ino, ret);
2667 : /* Release allocated io_end */
2668 0 : ext4_put_io_end(mpd->io_submit.io_end);
2669 0 : mpd->io_submit.io_end = NULL;
2670 0 : break;
2671 : }
2672 1158791 : mpd->do_map = 1;
2673 :
2674 1158791 : trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
2675 1158770 : ret = mpage_prepare_extent_to_map(mpd);
2676 1158842 : if (!ret && mpd->map.m_len)
2677 1117640 : ret = mpage_map_and_submit_extent(handle, mpd,
2678 : &give_up_on_write);
2679 : /*
2680 : * Caution: If the handle is synchronous,
2681 : * ext4_journal_stop() can wait for transaction commit
2682 : * to finish which may depend on writeback of pages to
2683 : * complete or on page lock to be released. In that
2684 : * case, we have to wait until after we have
2685 : * submitted all the IO, released page locks we hold,
2686 : * and dropped io_end reference (for extent conversion
2687 : * to be able to complete) before stopping the handle.
2688 : */
2689 1158831 : if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
2690 1158830 : ext4_journal_stop(handle);
2691 1158845 : handle = NULL;
2692 1158845 : mpd->do_map = 0;
2693 : }
2694 : /* Unlock pages we didn't use */
2695 1158846 : mpage_release_unused_pages(mpd, give_up_on_write);
2696 : /* Submit prepared bio */
2697 1158844 : ext4_io_submit(&mpd->io_submit);
2698 :
2699 : /*
2700 : * Drop our io_end reference we got from init. We have
2701 : * to be careful and use deferred io_end finishing if
2702 : * we are still holding the transaction as we can
2703 : * release the last reference to io_end which may end
2704 : * up doing unwritten extent conversion.
2705 : */
2706 1158799 : if (handle) {
2707 0 : ext4_put_io_end_defer(mpd->io_submit.io_end);
2708 0 : ext4_journal_stop(handle);
2709 : } else
2710 1158799 : ext4_put_io_end(mpd->io_submit.io_end);
2711 1158830 : mpd->io_submit.io_end = NULL;
2712 :
2713 1158830 : if (ret == -ENOSPC && sbi->s_journal) {
2714 : /*
2715 : * Commit the transaction which would
2716 : * free blocks released in the transaction
2717 : * and try again
2718 : */
2719 80 : jbd2_journal_force_commit_nested(sbi->s_journal);
2720 80 : ret = 0;
2721 80 : continue;
2722 : }
2723 : /* Fatal error - ENOMEM, EIO... */
2724 1158750 : if (ret)
2725 : break;
2726 : }
2727 1531294 : unplug:
2728 1531294 : blk_finish_plug(&plug);
2729 1531240 : if (!ret && !cycled && wbc->nr_to_write > 0) {
2730 72643 : cycled = 1;
2731 72643 : mpd->last_page = writeback_index - 1;
2732 72643 : mpd->first_page = 0;
2733 72643 : goto retry;
2734 : }
2735 :
2736 : /* Update index */
2737 1458597 : if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2738 : /*
2739 : * Set the writeback_index so that range_cyclic
2740 : * mode will write it back later
2741 : */
2742 443505 : mapping->writeback_index = mpd->first_page;
2743 :
2744 1015092 : out_writepages:
2745 1816148 : trace_ext4_writepages_result(inode, wbc, ret,
2746 1816148 : nr_to_write - wbc->nr_to_write);
2747 1815691 : return ret;
2748 : }
2749 :
2750 1675201 : static int ext4_writepages(struct address_space *mapping,
2751 : struct writeback_control *wbc)
2752 : {
2753 1675201 : struct super_block *sb = mapping->host->i_sb;
2754 1675201 : struct mpage_da_data mpd = {
2755 : .inode = mapping->host,
2756 : .wbc = wbc,
2757 : .can_map = 1,
2758 : };
2759 1675201 : int ret;
2760 1675201 : int alloc_ctx;
2761 :
2762 3350402 : if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
2763 : return -EIO;
2764 :
2765 1647443 : alloc_ctx = ext4_writepages_down_read(sb);
2766 1647435 : ret = ext4_do_writepages(&mpd);
2767 : /*
2768 : * For data=journal writeback we could have come across pages marked
2769 : * for delayed dirtying (PageChecked) which were just added to the
2770 : * running transaction. Try once more to get them to stable storage.
2771 : */
2772 1646720 : if (!ret && mpd.journalled_more_data)
2773 0 : ret = ext4_do_writepages(&mpd);
2774 1646720 : ext4_writepages_up_read(sb, alloc_ctx);
2775 :
2776 1646720 : return ret;
2777 : }
2778 :
2779 169069 : int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
2780 : {
2781 169069 : struct writeback_control wbc = {
2782 : .sync_mode = WB_SYNC_ALL,
2783 : .nr_to_write = LONG_MAX,
2784 169069 : .range_start = jinode->i_dirty_start,
2785 169069 : .range_end = jinode->i_dirty_end,
2786 : };
2787 169069 : struct mpage_da_data mpd = {
2788 169069 : .inode = jinode->i_vfs_inode,
2789 : .wbc = &wbc,
2790 : .can_map = 0,
2791 : };
2792 169069 : return ext4_do_writepages(&mpd);
2793 : }
2794 :
2795 0 : static int ext4_dax_writepages(struct address_space *mapping,
2796 : struct writeback_control *wbc)
2797 : {
2798 0 : int ret;
2799 0 : long nr_to_write = wbc->nr_to_write;
2800 0 : struct inode *inode = mapping->host;
2801 0 : struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2802 0 : int alloc_ctx;
2803 :
2804 0 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
2805 : return -EIO;
2806 :
2807 0 : alloc_ctx = ext4_writepages_down_read(inode->i_sb);
2808 0 : trace_ext4_writepages(inode, wbc);
2809 :
2810 0 : ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
2811 0 : trace_ext4_writepages_result(inode, wbc, ret,
2812 0 : nr_to_write - wbc->nr_to_write);
2813 0 : ext4_writepages_up_read(inode->i_sb, alloc_ctx);
2814 0 : return ret;
2815 : }
2816 :
2817 121171502 : static int ext4_nonda_switch(struct super_block *sb)
2818 : {
2819 121171502 : s64 free_clusters, dirty_clusters;
2820 121171502 : struct ext4_sb_info *sbi = EXT4_SB(sb);
2821 :
2822 : /*
2823 : * switch to non delalloc mode if we are running low
2824 : * on free block. The free block accounting via percpu
2825 : * counters can get slightly wrong with percpu_counter_batch getting
2826 : * accumulated on each CPU without updating global counters
2827 : * Delalloc need an accurate free block accounting. So switch
2828 : * to non delalloc when we are near to error range.
2829 : */
2830 121171502 : free_clusters =
2831 : percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2832 121171502 : dirty_clusters =
2833 : percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2834 : /*
2835 : * Start pushing delalloc when 1/2 of free blocks are dirty.
2836 : */
2837 121171502 : if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
2838 497095 : try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2839 :
2840 121173133 : if (2 * free_clusters < 3 * dirty_clusters ||
2841 120926496 : free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
2842 : /*
2843 : * free block count is less than 150% of dirty blocks
2844 : * or free blocks is less than watermark
2845 : */
2846 283635 : return 1;
2847 : }
2848 : return 0;
2849 : }
2850 :
2851 112694976 : static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2852 : loff_t pos, unsigned len,
2853 : struct page **pagep, void **fsdata)
2854 : {
2855 112694976 : int ret, retries = 0;
2856 112694976 : struct folio *folio;
2857 112694976 : pgoff_t index;
2858 112694976 : struct inode *inode = mapping->host;
2859 :
2860 225389952 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
2861 : return -EIO;
2862 :
2863 112694976 : index = pos >> PAGE_SHIFT;
2864 :
2865 112694976 : if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
2866 282739 : *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2867 282739 : return ext4_write_begin(file, mapping, pos,
2868 : len, pagep, fsdata);
2869 : }
2870 112425538 : *fsdata = (void *)0;
2871 112425538 : trace_ext4_da_write_begin(inode, pos, len);
2872 :
2873 112324706 : if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2874 0 : ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
2875 : pagep, fsdata);
2876 0 : if (ret < 0)
2877 : return ret;
2878 0 : if (ret == 1)
2879 : return 0;
2880 : }
2881 :
2882 112324706 : retry:
2883 112414743 : folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2884 : mapping_gfp_mask(mapping));
2885 112494344 : if (IS_ERR(folio))
2886 0 : return PTR_ERR(folio);
2887 :
2888 : /* In case writeback began while the folio was unlocked */
2889 112494344 : folio_wait_stable(folio);
2890 :
2891 : #ifdef CONFIG_FS_ENCRYPTION
2892 : ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
2893 : #else
2894 112488692 : ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
2895 : #endif
2896 112475105 : if (ret < 0) {
2897 161236 : folio_unlock(folio);
2898 161454 : folio_put(folio);
2899 : /*
2900 : * block_write_begin may have instantiated a few blocks
2901 : * outside i_size. Trim these off again. Don't need
2902 : * i_size_read because we hold inode lock.
2903 : */
2904 161572 : if (pos + len > inode->i_size)
2905 125672 : ext4_truncate_failed_write(inode);
2906 :
2907 320357 : if (ret == -ENOSPC &&
2908 158823 : ext4_should_retry_alloc(inode->i_sb, &retries))
2909 90037 : goto retry;
2910 71497 : return ret;
2911 : }
2912 :
2913 112313869 : *pagep = &folio->page;
2914 112313869 : return ret;
2915 : }
2916 :
2917 : /*
2918 : * Check if we should update i_disksize
2919 : * when write to the end of file but not require block allocation
2920 : */
2921 25538235 : static int ext4_da_should_update_i_disksize(struct folio *folio,
2922 : unsigned long offset)
2923 : {
2924 25538235 : struct buffer_head *bh;
2925 25538235 : struct inode *inode = folio->mapping->host;
2926 25538235 : unsigned int idx;
2927 25538235 : int i;
2928 :
2929 25538235 : bh = folio_buffers(folio);
2930 25538235 : idx = offset >> inode->i_blkbits;
2931 :
2932 25538246 : for (i = 0; i < idx; i++)
2933 11 : bh = bh->b_this_page;
2934 :
2935 76776847 : if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2936 25503389 : return 0;
2937 : return 1;
2938 : }
2939 :
2940 112673751 : static int ext4_da_write_end(struct file *file,
2941 : struct address_space *mapping,
2942 : loff_t pos, unsigned len, unsigned copied,
2943 : struct page *page, void *fsdata)
2944 : {
2945 112673751 : struct inode *inode = mapping->host;
2946 112673751 : loff_t new_i_size;
2947 112673751 : unsigned long start, end;
2948 112673751 : int write_mode = (int)(unsigned long)fsdata;
2949 112673751 : struct folio *folio = page_folio(page);
2950 :
2951 112590118 : if (write_mode == FALL_BACK_TO_NONDELALLOC)
2952 280296 : return ext4_write_end(file, mapping, pos,
2953 : len, copied, &folio->page, fsdata);
2954 :
2955 112309822 : trace_ext4_da_write_end(inode, pos, len, copied);
2956 :
2957 112313404 : if (write_mode != CONVERT_INLINE_DATA &&
2958 : ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
2959 : ext4_has_inline_data(inode))
2960 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
2961 : folio);
2962 :
2963 112313404 : if (unlikely(copied < len) && !PageUptodate(page))
2964 0 : copied = 0;
2965 :
2966 112313404 : start = pos & (PAGE_SIZE - 1);
2967 112313404 : end = start + copied - 1;
2968 :
2969 : /*
2970 : * Since we are holding inode lock, we are sure i_disksize <=
2971 : * i_size. We also know that if i_disksize < i_size, there are
2972 : * delalloc writes pending in the range upto i_size. If the end of
2973 : * the current write is <= i_size, there's no need to touch
2974 : * i_disksize since writeback will push i_disksize upto i_size
2975 : * eventually. If the end of the current write is > i_size and
2976 : * inside an allocated block (ext4_da_should_update_i_disksize()
2977 : * check), we need to update i_disksize here as certain
2978 : * ext4_writepages() paths not allocating blocks update i_disksize.
2979 : *
2980 : * Note that we defer inode dirtying to generic_write_end() /
2981 : * ext4_da_write_inline_data_end().
2982 : */
2983 112313404 : new_i_size = pos + copied;
2984 137836155 : if (copied && new_i_size > inode->i_size &&
2985 25532049 : ext4_da_should_update_i_disksize(folio, end))
2986 34841 : ext4_update_i_disksize(inode, new_i_size);
2987 :
2988 112304176 : return generic_write_end(file, mapping, pos, len, copied, &folio->page,
2989 : fsdata);
2990 : }
2991 :
2992 : /*
2993 : * Force all delayed allocation blocks to be allocated for a given inode.
2994 : */
2995 98201 : int ext4_alloc_da_blocks(struct inode *inode)
2996 : {
2997 98201 : trace_ext4_alloc_da_blocks(inode);
2998 :
2999 98187 : if (!EXT4_I(inode)->i_reserved_data_blocks)
3000 : return 0;
3001 :
3002 : /*
3003 : * We do something simple for now. The filemap_flush() will
3004 : * also start triggering a write of the data blocks, which is
3005 : * not strictly speaking necessary (and for users of
3006 : * laptop_mode, not even desirable). However, to do otherwise
3007 : * would require replicating code paths in:
3008 : *
3009 : * ext4_writepages() ->
3010 : * write_cache_pages() ---> (via passed in callback function)
3011 : * __mpage_da_writepage() -->
3012 : * mpage_add_bh_to_extent()
3013 : * mpage_da_map_blocks()
3014 : *
3015 : * The problem is that write_cache_pages(), located in
3016 : * mm/page-writeback.c, marks pages clean in preparation for
3017 : * doing I/O, which is not desirable if we're not planning on
3018 : * doing I/O at all.
3019 : *
3020 : * We could call write_cache_pages(), and then redirty all of
3021 : * the pages by calling redirty_page_for_writepage() but that
3022 : * would be ugly in the extreme. So instead we would need to
3023 : * replicate parts of the code in the above functions,
3024 : * simplifying them because we wouldn't actually intend to
3025 : * write out the pages, but rather only collect contiguous
3026 : * logical block extents, call the multi-block allocator, and
3027 : * then update the buffer heads with the block allocations.
3028 : *
3029 : * For now, though, we'll cheat by calling filemap_flush(),
3030 : * which will map the blocks, and start the I/O, but not
3031 : * actually wait for the I/O to complete.
3032 : */
3033 50466 : return filemap_flush(inode->i_mapping);
3034 : }
3035 :
3036 : /*
3037 : * bmap() is special. It gets used by applications such as lilo and by
3038 : * the swapper to find the on-disk block of a specific piece of data.
3039 : *
3040 : * Naturally, this is dangerous if the block concerned is still in the
3041 : * journal. If somebody makes a swapfile on an ext4 data-journaling
3042 : * filesystem and enables swap, then they may get a nasty shock when the
3043 : * data getting swapped to that swapfile suddenly gets overwritten by
3044 : * the original zero's written out previously to the journal and
3045 : * awaiting writeback in the kernel's buffer cache.
3046 : *
3047 : * So, if we see any bmap calls here on a modified, data-journaled file,
3048 : * take extra steps to flush any blocks which might be in the cache.
3049 : */
3050 5076 : static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3051 : {
3052 5076 : struct inode *inode = mapping->host;
3053 5076 : sector_t ret = 0;
3054 :
3055 5076 : inode_lock_shared(inode);
3056 : /*
3057 : * We can get here for an inline file via the FIBMAP ioctl
3058 : */
3059 5076 : if (ext4_has_inline_data(inode))
3060 0 : goto out;
3061 :
3062 5076 : if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3063 2 : (test_opt(inode->i_sb, DELALLOC) ||
3064 : ext4_should_journal_data(inode))) {
3065 : /*
3066 : * With delalloc or journalled data we want to sync the file so
3067 : * that we can make sure we allocate blocks for file and data
3068 : * is in place for the user to see it
3069 : */
3070 2 : filemap_write_and_wait(mapping);
3071 : }
3072 :
3073 5076 : ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
3074 :
3075 5076 : out:
3076 5076 : inode_unlock_shared(inode);
3077 5076 : return ret;
3078 : }
3079 :
3080 76329 : static int ext4_read_folio(struct file *file, struct folio *folio)
3081 : {
3082 76329 : int ret = -EAGAIN;
3083 76329 : struct inode *inode = folio->mapping->host;
3084 :
3085 76329 : trace_ext4_read_folio(inode, folio);
3086 :
3087 76329 : if (ext4_has_inline_data(inode))
3088 0 : ret = ext4_readpage_inline(inode, folio);
3089 :
3090 76329 : if (ret == -EAGAIN)
3091 76329 : return ext4_mpage_readpages(inode, NULL, folio);
3092 :
3093 : return ret;
3094 : }
3095 :
3096 964678 : static void ext4_readahead(struct readahead_control *rac)
3097 : {
3098 964678 : struct inode *inode = rac->mapping->host;
3099 :
3100 : /* If the file has inline data, no need to do readahead. */
3101 964678 : if (ext4_has_inline_data(inode))
3102 : return;
3103 :
3104 964678 : ext4_mpage_readpages(inode, rac, NULL);
3105 : }
3106 :
3107 32860807 : static void ext4_invalidate_folio(struct folio *folio, size_t offset,
3108 : size_t length)
3109 : {
3110 32860807 : trace_ext4_invalidate_folio(folio, offset, length);
3111 :
3112 : /* No journalling happens on data buffers when this function is used */
3113 98576700 : WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
3114 :
3115 32858887 : block_invalidate_folio(folio, offset, length);
3116 32860551 : }
3117 :
3118 1066 : static int __ext4_journalled_invalidate_folio(struct folio *folio,
3119 : size_t offset, size_t length)
3120 : {
3121 1066 : journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
3122 :
3123 1066 : trace_ext4_journalled_invalidate_folio(folio, offset, length);
3124 :
3125 : /*
3126 : * If it's a full truncate we just forget about the pending dirtying
3127 : */
3128 1066 : if (offset == 0 && length == folio_size(folio))
3129 1062 : folio_clear_checked(folio);
3130 :
3131 1066 : return jbd2_journal_invalidate_folio(journal, folio, offset, length);
3132 : }
3133 :
3134 : /* Wrapper for aops... */
3135 1066 : static void ext4_journalled_invalidate_folio(struct folio *folio,
3136 : size_t offset,
3137 : size_t length)
3138 : {
3139 1066 : WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
3140 1066 : }
3141 :
3142 42151793 : static bool ext4_release_folio(struct folio *folio, gfp_t wait)
3143 : {
3144 42151793 : struct inode *inode = folio->mapping->host;
3145 42151793 : journal_t *journal = EXT4_JOURNAL(inode);
3146 :
3147 42151793 : trace_ext4_release_folio(inode, folio);
3148 :
3149 : /* Page has dirty journalled data -> cannot release */
3150 42151282 : if (folio_test_checked(folio))
3151 : return false;
3152 42151282 : if (journal)
3153 42151261 : return jbd2_journal_try_to_free_buffers(journal, folio);
3154 : else
3155 21 : return try_to_free_buffers(folio);
3156 : }
3157 :
3158 2999800 : static bool ext4_inode_datasync_dirty(struct inode *inode)
3159 : {
3160 2999800 : journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
3161 :
3162 2999800 : if (journal) {
3163 2998539 : if (jbd2_transaction_committed(journal,
3164 2997286 : EXT4_I(inode)->i_datasync_tid))
3165 : return false;
3166 2335180 : if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
3167 0 : return !list_empty(&EXT4_I(inode)->i_fc_list);
3168 : return true;
3169 : }
3170 :
3171 : /* Any metadata buffers to write? */
3172 2514 : if (!list_empty(&inode->i_mapping->private_list))
3173 : return true;
3174 2514 : return inode->i_state & I_DIRTY_DATASYNC;
3175 : }
3176 :
3177 3000262 : static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
3178 : struct ext4_map_blocks *map, loff_t offset,
3179 : loff_t length, unsigned int flags)
3180 : {
3181 3000262 : u8 blkbits = inode->i_blkbits;
3182 :
3183 : /*
3184 : * Writes that span EOF might trigger an I/O size update on completion,
3185 : * so consider them to be dirty for the purpose of O_DSYNC, even if
3186 : * there is no other metadata changes being made or are pending.
3187 : */
3188 3000262 : iomap->flags = 0;
3189 3000262 : if (ext4_inode_datasync_dirty(inode) ||
3190 665868 : offset + length > i_size_read(inode))
3191 2385809 : iomap->flags |= IOMAP_F_DIRTY;
3192 :
3193 3000870 : if (map->m_flags & EXT4_MAP_NEW)
3194 945962 : iomap->flags |= IOMAP_F_NEW;
3195 :
3196 3000870 : if (flags & IOMAP_DAX)
3197 0 : iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
3198 : else
3199 3000870 : iomap->bdev = inode->i_sb->s_bdev;
3200 3000870 : iomap->offset = (u64) map->m_lblk << blkbits;
3201 3000870 : iomap->length = (u64) map->m_len << blkbits;
3202 :
3203 3000870 : if ((map->m_flags & EXT4_MAP_MAPPED) &&
3204 : !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3205 613 : iomap->flags |= IOMAP_F_MERGED;
3206 :
3207 : /*
3208 : * Flags passed to ext4_map_blocks() for direct I/O writes can result
3209 : * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
3210 : * set. In order for any allocated unwritten extents to be converted
3211 : * into written extents correctly within the ->end_io() handler, we
3212 : * need to ensure that the iomap->type is set appropriately. Hence, the
3213 : * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
3214 : * been set first.
3215 : */
3216 3000870 : if (map->m_flags & EXT4_MAP_UNWRITTEN) {
3217 880498 : iomap->type = IOMAP_UNWRITTEN;
3218 880498 : iomap->addr = (u64) map->m_pblk << blkbits;
3219 880498 : if (flags & IOMAP_DAX)
3220 0 : iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
3221 2120372 : } else if (map->m_flags & EXT4_MAP_MAPPED) {
3222 1589170 : iomap->type = IOMAP_MAPPED;
3223 1589170 : iomap->addr = (u64) map->m_pblk << blkbits;
3224 1589170 : if (flags & IOMAP_DAX)
3225 0 : iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
3226 : } else {
3227 531202 : iomap->type = IOMAP_HOLE;
3228 531202 : iomap->addr = IOMAP_NULL_ADDR;
3229 : }
3230 3000870 : }
3231 :
3232 1267970 : static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
3233 : unsigned int flags)
3234 : {
3235 1267970 : handle_t *handle;
3236 1267970 : u8 blkbits = inode->i_blkbits;
3237 1267970 : int ret, dio_credits, m_flags = 0, retries = 0;
3238 :
3239 : /*
3240 : * Trim the mapping request to the maximum value that we can map at
3241 : * once for direct I/O.
3242 : */
3243 1267970 : if (map->m_len > DIO_MAX_BLOCKS)
3244 31 : map->m_len = DIO_MAX_BLOCKS;
3245 1267970 : dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
3246 :
3247 1304359 : retry:
3248 : /*
3249 : * Either we allocate blocks and then don't get an unwritten extent, so
3250 : * in that case we have reserved enough credits. Or, the blocks are
3251 : * already allocated and unwritten. In that case, the extent conversion
3252 : * fits into the credits as well.
3253 : */
3254 1304359 : handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
3255 1304126 : if (IS_ERR(handle))
3256 0 : return PTR_ERR(handle);
3257 :
3258 : /*
3259 : * DAX and direct I/O are the only two operations that are currently
3260 : * supported with IOMAP_WRITE.
3261 : */
3262 1304126 : WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
3263 1304126 : if (flags & IOMAP_DAX)
3264 : m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
3265 : /*
3266 : * We use i_size instead of i_disksize here because delalloc writeback
3267 : * can complete at any point during the I/O and subsequently push the
3268 : * i_disksize out to i_size. This could be beyond where direct I/O is
3269 : * happening and thus expose allocated blocks to direct I/O reads.
3270 : */
3271 1304094 : else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
3272 : m_flags = EXT4_GET_BLOCKS_CREATE;
3273 1000491 : else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3274 1000382 : m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3275 :
3276 1304126 : ret = ext4_map_blocks(handle, inode, map, m_flags);
3277 :
3278 : /*
3279 : * We cannot fill holes in indirect tree based inodes as that could
3280 : * expose stale data in the case of a crash. Use the magic error code
3281 : * to fallback to buffered I/O.
3282 : */
3283 1305179 : if (!m_flags && !ret)
3284 33 : ret = -ENOTBLK;
3285 :
3286 1305179 : ext4_journal_stop(handle);
3287 1305059 : if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3288 36748 : goto retry;
3289 :
3290 : return ret;
3291 : }
3292 :
3293 :
3294 2735998 : static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3295 : unsigned flags, struct iomap *iomap, struct iomap *srcmap)
3296 : {
3297 2735998 : int ret;
3298 2735998 : struct ext4_map_blocks map;
3299 2735998 : u8 blkbits = inode->i_blkbits;
3300 :
3301 2735998 : if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3302 : return -EINVAL;
3303 :
3304 2735998 : if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
3305 : return -ERANGE;
3306 :
3307 : /*
3308 : * Calculate the first and last logical blocks respectively.
3309 : */
3310 2735998 : map.m_lblk = offset >> blkbits;
3311 2735998 : map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
3312 2735998 : EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
3313 :
3314 2735998 : if (flags & IOMAP_WRITE) {
3315 : /*
3316 : * We check here if the blocks are already allocated, then we
3317 : * don't need to start a journal txn and we can directly return
3318 : * the mapping information. This could boost performance
3319 : * especially in multi-threaded overwrite requests.
3320 : */
3321 1383196 : if (offset + length <= i_size_read(inode)) {
3322 1069995 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3323 1071076 : if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
3324 116053 : goto out;
3325 : }
3326 1268224 : ret = ext4_iomap_alloc(inode, &map, flags);
3327 : } else {
3328 1352802 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3329 : }
3330 :
3331 2622072 : if (ret < 0)
3332 : return ret;
3333 2323788 : out:
3334 : /*
3335 : * When inline encryption is enabled, sometimes I/O to an encrypted file
3336 : * has to be broken up to guarantee DUN contiguity. Handle this by
3337 : * limiting the length of the mapping returned.
3338 : */
3339 2439841 : map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
3340 :
3341 2439841 : ext4_set_iomap(inode, iomap, &map, offset, length, flags);
3342 :
3343 2439841 : return 0;
3344 : }
3345 :
3346 290427 : static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
3347 : loff_t length, unsigned flags, struct iomap *iomap,
3348 : struct iomap *srcmap)
3349 : {
3350 290427 : int ret;
3351 :
3352 : /*
3353 : * Even for writes we don't need to allocate blocks, so just pretend
3354 : * we are reading to save overhead of starting a transaction.
3355 : */
3356 290427 : flags &= ~IOMAP_WRITE;
3357 290427 : ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
3358 581005 : WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
3359 290490 : return ret;
3360 : }
3361 :
3362 2438478 : static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
3363 : ssize_t written, unsigned flags, struct iomap *iomap)
3364 : {
3365 : /*
3366 : * Check to see whether an error occurred while writing out the data to
3367 : * the allocated blocks. If so, return the magic error code so that we
3368 : * fallback to buffered I/O and attempt to complete the remainder of
3369 : * the I/O. Any blocks that may have been allocated in preparation for
3370 : * the direct I/O will be reused during buffered I/O.
3371 : */
3372 2438478 : if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
3373 67 : return -ENOTBLK;
3374 :
3375 : return 0;
3376 : }
3377 :
3378 : const struct iomap_ops ext4_iomap_ops = {
3379 : .iomap_begin = ext4_iomap_begin,
3380 : .iomap_end = ext4_iomap_end,
3381 : };
3382 :
3383 : const struct iomap_ops ext4_iomap_overwrite_ops = {
3384 : .iomap_begin = ext4_iomap_overwrite_begin,
3385 : .iomap_end = ext4_iomap_end,
3386 : };
3387 :
3388 244616 : static bool ext4_iomap_is_delalloc(struct inode *inode,
3389 : struct ext4_map_blocks *map)
3390 : {
3391 244616 : struct extent_status es;
3392 244616 : ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
3393 :
3394 244616 : ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
3395 : map->m_lblk, end, &es);
3396 :
3397 244616 : if (!es.es_len || es.es_lblk > end)
3398 : return false;
3399 :
3400 453 : if (es.es_lblk > map->m_lblk) {
3401 6 : map->m_len = es.es_lblk - map->m_lblk;
3402 6 : return false;
3403 : }
3404 :
3405 447 : offset = map->m_lblk - es.es_lblk;
3406 447 : map->m_len = es.es_len - offset;
3407 :
3408 447 : return true;
3409 : }
3410 :
3411 560848 : static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
3412 : loff_t length, unsigned int flags,
3413 : struct iomap *iomap, struct iomap *srcmap)
3414 : {
3415 560848 : int ret;
3416 560848 : bool delalloc = false;
3417 560848 : struct ext4_map_blocks map;
3418 560848 : u8 blkbits = inode->i_blkbits;
3419 :
3420 560848 : if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3421 : return -EINVAL;
3422 :
3423 560848 : if (ext4_has_inline_data(inode)) {
3424 0 : ret = ext4_inline_data_iomap(inode, iomap);
3425 0 : if (ret != -EAGAIN) {
3426 0 : if (ret == 0 && offset >= iomap->length)
3427 0 : ret = -ENOENT;
3428 0 : return ret;
3429 : }
3430 : }
3431 :
3432 : /*
3433 : * Calculate the first and last logical block respectively.
3434 : */
3435 560848 : map.m_lblk = offset >> blkbits;
3436 560848 : map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
3437 560848 : EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
3438 :
3439 : /*
3440 : * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
3441 : * So handle it here itself instead of querying ext4_map_blocks().
3442 : * Since ext4_map_blocks() will warn about it and will return
3443 : * -EIO error.
3444 : */
3445 560848 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
3446 1926 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3447 :
3448 1926 : if (offset >= sbi->s_bitmap_maxbytes) {
3449 0 : map.m_flags = 0;
3450 0 : goto set_iomap;
3451 : }
3452 : }
3453 :
3454 560848 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3455 560848 : if (ret < 0)
3456 : return ret;
3457 560848 : if (ret == 0)
3458 244616 : delalloc = ext4_iomap_is_delalloc(inode, &map);
3459 :
3460 316232 : set_iomap:
3461 560848 : ext4_set_iomap(inode, iomap, &map, offset, length, flags);
3462 560848 : if (delalloc && iomap->type == IOMAP_HOLE)
3463 447 : iomap->type = IOMAP_DELALLOC;
3464 :
3465 : return 0;
3466 : }
3467 :
3468 : const struct iomap_ops ext4_iomap_report_ops = {
3469 : .iomap_begin = ext4_iomap_begin_report,
3470 : };
3471 :
3472 : /*
3473 : * For data=journal mode, folio should be marked dirty only when it was
3474 : * writeably mapped. When that happens, it was already attached to the
3475 : * transaction and marked as jbddirty (we take care of this in
3476 : * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
3477 : * so we should have nothing to do here, except for the case when someone
3478 : * had the page pinned and dirtied the page through this pin (e.g. by doing
3479 : * direct IO to it). In that case we'd need to attach buffers here to the
3480 : * transaction but we cannot due to lock ordering. We cannot just dirty the
3481 : * folio and leave attached buffers clean, because the buffers' dirty state is
3482 : * "definitive". We cannot just set the buffers dirty or jbddirty because all
3483 : * the journalling code will explode. So what we do is to mark the folio
3484 : * "pending dirty" and next time ext4_writepages() is called, attach buffers
3485 : * to the transaction appropriately.
3486 : */
3487 0 : static bool ext4_journalled_dirty_folio(struct address_space *mapping,
3488 : struct folio *folio)
3489 : {
3490 0 : WARN_ON_ONCE(!folio_buffers(folio));
3491 0 : if (folio_maybe_dma_pinned(folio))
3492 0 : folio_set_checked(folio);
3493 0 : return filemap_dirty_folio(mapping, folio);
3494 : }
3495 :
3496 25319976 : static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
3497 : {
3498 31007958 : WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
3499 25319976 : WARN_ON_ONCE(!folio_buffers(folio));
3500 25319976 : return block_dirty_folio(mapping, folio);
3501 : }
3502 :
3503 30 : static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
3504 : struct file *file, sector_t *span)
3505 : {
3506 30 : return iomap_swapfile_activate(sis, file, span,
3507 : &ext4_iomap_report_ops);
3508 : }
3509 :
3510 : static const struct address_space_operations ext4_aops = {
3511 : .read_folio = ext4_read_folio,
3512 : .readahead = ext4_readahead,
3513 : .writepages = ext4_writepages,
3514 : .write_begin = ext4_write_begin,
3515 : .write_end = ext4_write_end,
3516 : .dirty_folio = ext4_dirty_folio,
3517 : .bmap = ext4_bmap,
3518 : .invalidate_folio = ext4_invalidate_folio,
3519 : .release_folio = ext4_release_folio,
3520 : .direct_IO = noop_direct_IO,
3521 : .migrate_folio = buffer_migrate_folio,
3522 : .is_partially_uptodate = block_is_partially_uptodate,
3523 : .error_remove_page = generic_error_remove_page,
3524 : .swap_activate = ext4_iomap_swap_activate,
3525 : };
3526 :
3527 : static const struct address_space_operations ext4_journalled_aops = {
3528 : .read_folio = ext4_read_folio,
3529 : .readahead = ext4_readahead,
3530 : .writepages = ext4_writepages,
3531 : .write_begin = ext4_write_begin,
3532 : .write_end = ext4_journalled_write_end,
3533 : .dirty_folio = ext4_journalled_dirty_folio,
3534 : .bmap = ext4_bmap,
3535 : .invalidate_folio = ext4_journalled_invalidate_folio,
3536 : .release_folio = ext4_release_folio,
3537 : .direct_IO = noop_direct_IO,
3538 : .migrate_folio = buffer_migrate_folio_norefs,
3539 : .is_partially_uptodate = block_is_partially_uptodate,
3540 : .error_remove_page = generic_error_remove_page,
3541 : .swap_activate = ext4_iomap_swap_activate,
3542 : };
3543 :
3544 : static const struct address_space_operations ext4_da_aops = {
3545 : .read_folio = ext4_read_folio,
3546 : .readahead = ext4_readahead,
3547 : .writepages = ext4_writepages,
3548 : .write_begin = ext4_da_write_begin,
3549 : .write_end = ext4_da_write_end,
3550 : .dirty_folio = ext4_dirty_folio,
3551 : .bmap = ext4_bmap,
3552 : .invalidate_folio = ext4_invalidate_folio,
3553 : .release_folio = ext4_release_folio,
3554 : .direct_IO = noop_direct_IO,
3555 : .migrate_folio = buffer_migrate_folio,
3556 : .is_partially_uptodate = block_is_partially_uptodate,
3557 : .error_remove_page = generic_error_remove_page,
3558 : .swap_activate = ext4_iomap_swap_activate,
3559 : };
3560 :
3561 : static const struct address_space_operations ext4_dax_aops = {
3562 : .writepages = ext4_dax_writepages,
3563 : .direct_IO = noop_direct_IO,
3564 : .dirty_folio = noop_dirty_folio,
3565 : .bmap = ext4_bmap,
3566 : .swap_activate = ext4_iomap_swap_activate,
3567 : };
3568 :
3569 2296020 : void ext4_set_aops(struct inode *inode)
3570 : {
3571 2296020 : switch (ext4_inode_journal_mode(inode)) {
3572 : case EXT4_INODE_ORDERED_DATA_MODE:
3573 : case EXT4_INODE_WRITEBACK_DATA_MODE:
3574 2289313 : break;
3575 5632 : case EXT4_INODE_JOURNAL_DATA_MODE:
3576 5632 : inode->i_mapping->a_ops = &ext4_journalled_aops;
3577 5632 : return;
3578 0 : default:
3579 0 : BUG();
3580 : }
3581 2289313 : if (IS_DAX(inode))
3582 0 : inode->i_mapping->a_ops = &ext4_dax_aops;
3583 2289313 : else if (test_opt(inode->i_sb, DELALLOC))
3584 2289047 : inode->i_mapping->a_ops = &ext4_da_aops;
3585 : else
3586 266 : inode->i_mapping->a_ops = &ext4_aops;
3587 : }
3588 :
3589 923760 : static int __ext4_block_zero_page_range(handle_t *handle,
3590 : struct address_space *mapping, loff_t from, loff_t length)
3591 : {
3592 923760 : ext4_fsblk_t index = from >> PAGE_SHIFT;
3593 923760 : unsigned offset = from & (PAGE_SIZE-1);
3594 923760 : unsigned blocksize, pos;
3595 923760 : ext4_lblk_t iblock;
3596 923760 : struct inode *inode = mapping->host;
3597 923760 : struct buffer_head *bh;
3598 923760 : struct folio *folio;
3599 923760 : int err = 0;
3600 :
3601 923760 : folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
3602 : FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
3603 : mapping_gfp_constraint(mapping, ~__GFP_FS));
3604 923786 : if (IS_ERR(folio))
3605 0 : return PTR_ERR(folio);
3606 :
3607 923786 : blocksize = inode->i_sb->s_blocksize;
3608 :
3609 923786 : iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
3610 :
3611 923786 : bh = folio_buffers(folio);
3612 923786 : if (!bh) {
3613 744362 : create_empty_buffers(&folio->page, blocksize, 0);
3614 744361 : bh = folio_buffers(folio);
3615 : }
3616 :
3617 : /* Find the buffer that contains "offset" */
3618 923785 : pos = blocksize;
3619 923785 : while (offset >= pos) {
3620 0 : bh = bh->b_this_page;
3621 0 : iblock++;
3622 0 : pos += blocksize;
3623 : }
3624 1847570 : if (buffer_freed(bh)) {
3625 0 : BUFFER_TRACE(bh, "freed: skip");
3626 0 : goto unlock;
3627 : }
3628 1847570 : if (!buffer_mapped(bh)) {
3629 780100 : BUFFER_TRACE(bh, "unmapped");
3630 780100 : ext4_get_block(inode, iblock, bh, 0);
3631 : /* unmapped? It's a hole - nothing to do */
3632 1560210 : if (!buffer_mapped(bh)) {
3633 650926 : BUFFER_TRACE(bh, "still unmapped");
3634 650926 : goto unlock;
3635 : }
3636 : }
3637 :
3638 : /* Ok, it's mapped. Make sure it's up-to-date */
3639 473345 : if (folio_test_uptodate(folio))
3640 200479 : set_buffer_uptodate(bh);
3641 :
3642 545727 : if (!buffer_uptodate(bh)) {
3643 69993 : err = ext4_read_bh_lock(bh, 0, true);
3644 69993 : if (err)
3645 0 : goto unlock;
3646 : if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
3647 : /* We expect the key to be set. */
3648 : BUG_ON(!fscrypt_has_encryption_key(inode));
3649 : err = fscrypt_decrypt_pagecache_blocks(folio,
3650 : blocksize,
3651 : bh_offset(bh));
3652 : if (err) {
3653 : clear_buffer_uptodate(bh);
3654 : goto unlock;
3655 : }
3656 : }
3657 : }
3658 272863 : if (ext4_should_journal_data(inode)) {
3659 0 : BUFFER_TRACE(bh, "get write access");
3660 0 : err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
3661 : EXT4_JTR_NONE);
3662 0 : if (err)
3663 0 : goto unlock;
3664 : }
3665 272863 : folio_zero_range(folio, offset, length);
3666 272864 : BUFFER_TRACE(bh, "zeroed end of block");
3667 :
3668 272864 : if (ext4_should_journal_data(inode)) {
3669 0 : err = ext4_dirty_journalled_data(handle, bh);
3670 : } else {
3671 272864 : err = 0;
3672 272864 : mark_buffer_dirty(bh);
3673 272866 : if (ext4_should_order_data(inode))
3674 272866 : err = ext4_jbd2_inode_add_write(handle, inode, from,
3675 : length);
3676 : }
3677 :
3678 0 : unlock:
3679 923794 : folio_unlock(folio);
3680 923800 : folio_put(folio);
3681 923800 : return err;
3682 : }
3683 :
3684 : /*
3685 : * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3686 : * starting from file offset 'from'. The range to be zero'd must
3687 : * be contained with in one block. If the specified range exceeds
3688 : * the end of the block it will be shortened to end of the block
3689 : * that corresponds to 'from'
3690 : */
3691 923770 : static int ext4_block_zero_page_range(handle_t *handle,
3692 : struct address_space *mapping, loff_t from, loff_t length)
3693 : {
3694 923770 : struct inode *inode = mapping->host;
3695 923770 : unsigned offset = from & (PAGE_SIZE-1);
3696 923770 : unsigned blocksize = inode->i_sb->s_blocksize;
3697 923770 : unsigned max = blocksize - (offset & (blocksize - 1));
3698 :
3699 : /*
3700 : * correct length if it does not fall between
3701 : * 'from' and the end of the block
3702 : */
3703 923770 : if (length > max || length < 0)
3704 388437 : length = max;
3705 :
3706 923770 : if (IS_DAX(inode)) {
3707 0 : return dax_zero_range(inode, from, length, NULL,
3708 : &ext4_iomap_ops);
3709 : }
3710 923770 : return __ext4_block_zero_page_range(handle, mapping, from, length);
3711 : }
3712 :
3713 : /*
3714 : * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3715 : * up to the end of the block which corresponds to `from'.
3716 : * This required during truncate. We need to physically zero the tail end
3717 : * of that block so it doesn't yield old data if the file is later grown.
3718 : */
3719 150843 : static int ext4_block_truncate_page(handle_t *handle,
3720 : struct address_space *mapping, loff_t from)
3721 : {
3722 150843 : unsigned offset = from & (PAGE_SIZE-1);
3723 150843 : unsigned length;
3724 150843 : unsigned blocksize;
3725 150843 : struct inode *inode = mapping->host;
3726 :
3727 : /* If we are processing an encrypted inode during orphan list handling */
3728 150843 : if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
3729 : return 0;
3730 :
3731 150845 : blocksize = inode->i_sb->s_blocksize;
3732 150845 : length = blocksize - (offset & (blocksize - 1));
3733 :
3734 150845 : return ext4_block_zero_page_range(handle, mapping, from, length);
3735 : }
3736 :
3737 441285 : int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3738 : loff_t lstart, loff_t length)
3739 : {
3740 441285 : struct super_block *sb = inode->i_sb;
3741 441285 : struct address_space *mapping = inode->i_mapping;
3742 441285 : unsigned partial_start, partial_end;
3743 441285 : ext4_fsblk_t start, end;
3744 441285 : loff_t byte_end = (lstart + length - 1);
3745 441285 : int err = 0;
3746 :
3747 441285 : partial_start = lstart & (sb->s_blocksize - 1);
3748 441285 : partial_end = byte_end & (sb->s_blocksize - 1);
3749 :
3750 441285 : start = lstart >> sb->s_blocksize_bits;
3751 441285 : end = byte_end >> sb->s_blocksize_bits;
3752 :
3753 : /* Handle partial zero within the single block */
3754 441285 : if (start == end &&
3755 38135 : (partial_start || (partial_end != sb->s_blocksize - 1))) {
3756 11407 : err = ext4_block_zero_page_range(handle, mapping,
3757 : lstart, length);
3758 11407 : return err;
3759 : }
3760 : /* Handle partial zero out on the start of the range */
3761 429878 : if (partial_start) {
3762 388439 : err = ext4_block_zero_page_range(handle, mapping,
3763 388439 : lstart, sb->s_blocksize);
3764 388441 : if (err)
3765 : return err;
3766 : }
3767 : /* Handle partial zero out on the end of the range */
3768 429880 : if (partial_end != sb->s_blocksize - 1)
3769 373091 : err = ext4_block_zero_page_range(handle, mapping,
3770 : byte_end - partial_end,
3771 373091 : partial_end + 1);
3772 : return err;
3773 : }
3774 :
3775 801716 : int ext4_can_truncate(struct inode *inode)
3776 : {
3777 801716 : if (S_ISREG(inode->i_mode))
3778 : return 1;
3779 199561 : if (S_ISDIR(inode->i_mode))
3780 : return 1;
3781 10915 : if (S_ISLNK(inode->i_mode))
3782 10915 : return !ext4_inode_is_fast_symlink(inode);
3783 : return 0;
3784 : }
3785 :
3786 : /*
3787 : * We have to make sure i_disksize gets properly updated before we truncate
3788 : * page cache due to hole punching or zero range. Otherwise i_disksize update
3789 : * can get lost as it may have been postponed to submission of writeback but
3790 : * that will never happen after we truncate page cache.
3791 : */
3792 413997 : int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
3793 : loff_t len)
3794 : {
3795 413997 : handle_t *handle;
3796 413997 : int ret;
3797 :
3798 413997 : loff_t size = i_size_read(inode);
3799 :
3800 413997 : WARN_ON(!inode_is_locked(inode));
3801 413997 : if (offset > size || offset + len < size)
3802 : return 0;
3803 :
3804 76093 : if (EXT4_I(inode)->i_disksize >= size)
3805 : return 0;
3806 :
3807 916 : handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
3808 916 : if (IS_ERR(handle))
3809 0 : return PTR_ERR(handle);
3810 916 : ext4_update_i_disksize(inode, size);
3811 916 : ret = ext4_mark_inode_dirty(handle, inode);
3812 916 : ext4_journal_stop(handle);
3813 :
3814 916 : return ret;
3815 : }
3816 :
3817 0 : static void ext4_wait_dax_page(struct inode *inode)
3818 : {
3819 0 : filemap_invalidate_unlock(inode->i_mapping);
3820 0 : schedule();
3821 0 : filemap_invalidate_lock(inode->i_mapping);
3822 0 : }
3823 :
3824 1286973 : int ext4_break_layouts(struct inode *inode)
3825 : {
3826 1286973 : struct page *page;
3827 1286973 : int error;
3828 :
3829 1286973 : if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
3830 : return -EINVAL;
3831 :
3832 1286973 : do {
3833 1286973 : page = dax_layout_busy_page(inode->i_mapping);
3834 1286932 : if (!page)
3835 : return 0;
3836 :
3837 0 : error = ___wait_var_event(&page->_refcount,
3838 : atomic_read(&page->_refcount) == 1,
3839 : TASK_INTERRUPTIBLE, 0, 0,
3840 : ext4_wait_dax_page(inode));
3841 0 : } while (error == 0);
3842 :
3843 : return error;
3844 : }
3845 :
3846 : /*
3847 : * ext4_punch_hole: punches a hole in a file by releasing the blocks
3848 : * associated with the given offset and length
3849 : *
3850 : * @inode: File inode
3851 : * @offset: The offset where the hole will begin
3852 : * @len: The length of the hole
3853 : *
3854 : * Returns: 0 on success or negative on failure
3855 : */
3856 :
3857 255389 : int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3858 : {
3859 255389 : struct inode *inode = file_inode(file);
3860 255389 : struct super_block *sb = inode->i_sb;
3861 255389 : ext4_lblk_t first_block, stop_block;
3862 255389 : struct address_space *mapping = inode->i_mapping;
3863 255389 : loff_t first_block_offset, last_block_offset, max_length;
3864 255389 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3865 255389 : handle_t *handle;
3866 255389 : unsigned int credits;
3867 255389 : int ret = 0, ret2 = 0;
3868 :
3869 255389 : trace_ext4_punch_hole(inode, offset, length, 0);
3870 :
3871 : /*
3872 : * Write out all dirty pages to avoid race conditions
3873 : * Then release them.
3874 : */
3875 255393 : if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
3876 136448 : ret = filemap_write_and_wait_range(mapping, offset,
3877 136448 : offset + length - 1);
3878 136446 : if (ret)
3879 : return ret;
3880 : }
3881 :
3882 255390 : inode_lock(inode);
3883 :
3884 : /* No need to punch hole beyond i_size */
3885 255398 : if (offset >= inode->i_size)
3886 13464 : goto out_mutex;
3887 :
3888 : /*
3889 : * If the hole extends beyond i_size, set the hole
3890 : * to end after the page that contains i_size
3891 : */
3892 241934 : if (offset + length > inode->i_size) {
3893 3114 : length = inode->i_size +
3894 3114 : PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
3895 : offset;
3896 : }
3897 :
3898 : /*
3899 : * For punch hole the length + offset needs to be within one block
3900 : * before last range. Adjust the length if it goes beyond that limit.
3901 : */
3902 241934 : max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
3903 241934 : if (offset + length > max_length)
3904 0 : length = max_length - offset;
3905 :
3906 241934 : if (offset & (sb->s_blocksize - 1) ||
3907 41407 : (offset + length) & (sb->s_blocksize - 1)) {
3908 : /*
3909 : * Attach jinode to inode for jbd2 if we do any zeroing of
3910 : * partial block
3911 : */
3912 200566 : ret = ext4_inode_attach_jinode(inode);
3913 200566 : if (ret < 0)
3914 0 : goto out_mutex;
3915 :
3916 : }
3917 :
3918 : /* Wait all existing dio workers, newcomers will block on i_rwsem */
3919 241934 : inode_dio_wait(inode);
3920 :
3921 241934 : ret = file_modified(file);
3922 241938 : if (ret)
3923 0 : goto out_mutex;
3924 :
3925 : /*
3926 : * Prevent page faults from reinstantiating pages we have released from
3927 : * page cache.
3928 : */
3929 241938 : filemap_invalidate_lock(mapping);
3930 :
3931 241938 : ret = ext4_break_layouts(inode);
3932 241938 : if (ret)
3933 0 : goto out_dio;
3934 :
3935 241938 : first_block_offset = round_up(offset, sb->s_blocksize);
3936 241938 : last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3937 :
3938 : /* Now release the pages and zero block aligned part of pages*/
3939 241938 : if (last_block_offset > first_block_offset) {
3940 225494 : ret = ext4_update_disksize_before_punch(inode, offset, length);
3941 225491 : if (ret)
3942 0 : goto out_dio;
3943 225491 : truncate_pagecache_range(inode, first_block_offset,
3944 : last_block_offset);
3945 : }
3946 :
3947 241939 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3948 241931 : credits = ext4_writepage_trans_blocks(inode);
3949 : else
3950 8 : credits = ext4_blocks_for_truncate(inode);
3951 241939 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3952 241939 : if (IS_ERR(handle)) {
3953 0 : ret = PTR_ERR(handle);
3954 0 : ext4_std_error(sb, ret);
3955 0 : goto out_dio;
3956 : }
3957 :
3958 241939 : ret = ext4_zero_partial_blocks(handle, inode, offset,
3959 : length);
3960 241940 : if (ret)
3961 0 : goto out_stop;
3962 :
3963 0 : first_block = (offset + sb->s_blocksize - 1) >>
3964 241940 : EXT4_BLOCK_SIZE_BITS(sb);
3965 241940 : stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
3966 :
3967 : /* If there are blocks to remove, do it */
3968 241940 : if (stop_block > first_block) {
3969 :
3970 225496 : down_write(&EXT4_I(inode)->i_data_sem);
3971 225496 : ext4_discard_preallocations(inode, 0);
3972 :
3973 225496 : ext4_es_remove_extent(inode, first_block,
3974 : stop_block - first_block);
3975 :
3976 225496 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 225488 : ret = ext4_ext_remove_space(inode, first_block,
3978 : stop_block - 1);
3979 : else
3980 8 : ret = ext4_ind_remove_space(handle, inode, first_block,
3981 : stop_block);
3982 :
3983 225496 : up_write(&EXT4_I(inode)->i_data_sem);
3984 : }
3985 241939 : ext4_fc_track_range(handle, inode, first_block, stop_block);
3986 241939 : if (IS_SYNC(inode))
3987 0 : ext4_handle_sync(handle);
3988 :
3989 241939 : inode->i_mtime = inode->i_ctime = current_time(inode);
3990 241940 : ret2 = ext4_mark_inode_dirty(handle, inode);
3991 241940 : if (unlikely(ret2))
3992 0 : ret = ret2;
3993 241940 : if (ret >= 0)
3994 241940 : ext4_update_inode_fsync_trans(handle, inode, 1);
3995 0 : out_stop:
3996 241939 : ext4_journal_stop(handle);
3997 241939 : out_dio:
3998 241939 : filemap_invalidate_unlock(mapping);
3999 255404 : out_mutex:
4000 255404 : inode_unlock(inode);
4001 255404 : return ret;
4002 : }
4003 :
4004 6358623 : int ext4_inode_attach_jinode(struct inode *inode)
4005 : {
4006 6358623 : struct ext4_inode_info *ei = EXT4_I(inode);
4007 6358623 : struct jbd2_inode *jinode;
4008 :
4009 6358623 : if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
4010 : return 0;
4011 :
4012 2011176 : jinode = jbd2_alloc_inode(GFP_KERNEL);
4013 2012805 : spin_lock(&inode->i_lock);
4014 2016878 : if (!ei->jinode) {
4015 2015870 : if (!jinode) {
4016 0 : spin_unlock(&inode->i_lock);
4017 0 : return -ENOMEM;
4018 : }
4019 2015870 : ei->jinode = jinode;
4020 2015870 : jbd2_journal_init_jbd_inode(ei->jinode, inode);
4021 2015870 : jinode = NULL;
4022 : }
4023 2000204 : spin_unlock(&inode->i_lock);
4024 2014685 : if (unlikely(jinode != NULL))
4025 1 : jbd2_free_inode(jinode);
4026 : return 0;
4027 : }
4028 :
4029 : /*
4030 : * ext4_truncate()
4031 : *
4032 : * We block out ext4_get_block() block instantiations across the entire
4033 : * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4034 : * simultaneously on behalf of the same inode.
4035 : *
4036 : * As we work through the truncate and commit bits of it to the journal there
4037 : * is one core, guiding principle: the file's tree must always be consistent on
4038 : * disk. We must be able to restart the truncate after a crash.
4039 : *
4040 : * The file's tree may be transiently inconsistent in memory (although it
4041 : * probably isn't), but whenever we close off and commit a journal transaction,
4042 : * the contents of (the filesystem + the journal) must be consistent and
4043 : * restartable. It's pretty simple, really: bottom up, right to left (although
4044 : * left-to-right works OK too).
4045 : *
4046 : * Note that at recovery time, journal replay occurs *before* the restart of
4047 : * truncate against the orphan inode list.
4048 : *
4049 : * The committed inode has the new, desired i_size (which is the same as
4050 : * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4051 : * that this inode's truncate did not complete and it will again call
4052 : * ext4_truncate() to have another go. So there will be instantiated blocks
4053 : * to the right of the truncation point in a crashed ext4 filesystem. But
4054 : * that's fine - as long as they are linked from the inode, the post-crash
4055 : * ext4_truncate() run will find them and release them.
4056 : */
4057 799445 : int ext4_truncate(struct inode *inode)
4058 : {
4059 799445 : struct ext4_inode_info *ei = EXT4_I(inode);
4060 799445 : unsigned int credits;
4061 799445 : int err = 0, err2;
4062 799445 : handle_t *handle;
4063 799445 : struct address_space *mapping = inode->i_mapping;
4064 :
4065 : /*
4066 : * There is a possibility that we're either freeing the inode
4067 : * or it's a completely new inode. In those cases we might not
4068 : * have i_rwsem locked because it's not necessary.
4069 : */
4070 799445 : if (!(inode->i_state & (I_NEW|I_FREEING)))
4071 507379 : WARN_ON(!inode_is_locked(inode));
4072 799445 : trace_ext4_truncate_enter(inode);
4073 :
4074 799144 : if (!ext4_can_truncate(inode))
4075 0 : goto out_trace;
4076 :
4077 799267 : if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4078 530773 : ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4079 :
4080 799529 : if (ext4_has_inline_data(inode)) {
4081 0 : int has_inline = 1;
4082 :
4083 0 : err = ext4_inline_data_truncate(inode, &has_inline);
4084 0 : if (err || has_inline)
4085 0 : goto out_trace;
4086 : }
4087 :
4088 : /* If we zero-out tail of the page, we have to create jinode for jbd2 */
4089 799529 : if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
4090 150812 : err = ext4_inode_attach_jinode(inode);
4091 150818 : if (err)
4092 0 : goto out_trace;
4093 : }
4094 :
4095 799535 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4096 799504 : credits = ext4_writepage_trans_blocks(inode);
4097 : else
4098 31 : credits = ext4_blocks_for_truncate(inode);
4099 :
4100 798969 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
4101 799655 : if (IS_ERR(handle)) {
4102 0 : err = PTR_ERR(handle);
4103 0 : goto out_trace;
4104 : }
4105 :
4106 799655 : if (inode->i_size & (inode->i_sb->s_blocksize - 1))
4107 150843 : ext4_block_truncate_page(handle, mapping, inode->i_size);
4108 :
4109 : /*
4110 : * We add the inode to the orphan list, so that if this
4111 : * truncate spans multiple transactions, and we crash, we will
4112 : * resume the truncate when the filesystem recovers. It also
4113 : * marks the inode dirty, to catch the new size.
4114 : *
4115 : * Implication: the file must always be in a sane, consistent
4116 : * truncatable state while each transaction commits.
4117 : */
4118 799671 : err = ext4_orphan_add(handle, inode);
4119 799936 : if (err)
4120 0 : goto out_stop;
4121 :
4122 799936 : down_write(&EXT4_I(inode)->i_data_sem);
4123 :
4124 799915 : ext4_discard_preallocations(inode, 0);
4125 :
4126 799783 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4127 799752 : err = ext4_ext_truncate(handle, inode);
4128 : else
4129 31 : ext4_ind_truncate(handle, inode);
4130 :
4131 799622 : up_write(&ei->i_data_sem);
4132 799460 : if (err)
4133 1 : goto out_stop;
4134 :
4135 799459 : if (IS_SYNC(inode))
4136 15 : ext4_handle_sync(handle);
4137 :
4138 799444 : out_stop:
4139 : /*
4140 : * If this was a simple ftruncate() and the file will remain alive,
4141 : * then we need to clear up the orphan record which we created above.
4142 : * However, if this was a real unlink then we were called by
4143 : * ext4_evict_inode(), and we allow that function to clean up the
4144 : * orphan info for us.
4145 : */
4146 799460 : if (inode->i_nlink)
4147 507453 : ext4_orphan_del(handle, inode);
4148 :
4149 799983 : inode->i_mtime = inode->i_ctime = current_time(inode);
4150 799980 : err2 = ext4_mark_inode_dirty(handle, inode);
4151 800022 : if (unlikely(err2 && !err))
4152 0 : err = err2;
4153 800022 : ext4_journal_stop(handle);
4154 :
4155 800004 : out_trace:
4156 800004 : trace_ext4_truncate_exit(inode);
4157 799997 : return err;
4158 : }
4159 :
4160 : static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
4161 : {
4162 78518602 : if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4163 204 : return inode_peek_iversion_raw(inode);
4164 : else
4165 78518398 : return inode_peek_iversion(inode);
4166 : }
4167 :
4168 78531413 : static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
4169 : struct ext4_inode_info *ei)
4170 : {
4171 78531413 : struct inode *inode = &(ei->vfs_inode);
4172 78531413 : u64 i_blocks = READ_ONCE(inode->i_blocks);
4173 78531413 : struct super_block *sb = inode->i_sb;
4174 :
4175 78531413 : if (i_blocks <= ~0U) {
4176 : /*
4177 : * i_blocks can be represented in a 32 bit variable
4178 : * as multiple of 512 bytes
4179 : */
4180 78531413 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4181 78531413 : raw_inode->i_blocks_high = 0;
4182 78531413 : ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4183 78531413 : return 0;
4184 : }
4185 :
4186 : /*
4187 : * This should never happen since sb->s_maxbytes should not have
4188 : * allowed this, sb->s_maxbytes was set according to the huge_file
4189 : * feature in ext4_fill_super().
4190 : */
4191 0 : if (!ext4_has_feature_huge_file(sb))
4192 : return -EFSCORRUPTED;
4193 :
4194 0 : if (i_blocks <= 0xffffffffffffULL) {
4195 : /*
4196 : * i_blocks can be represented in a 48 bit variable
4197 : * as multiple of 512 bytes
4198 : */
4199 0 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4200 0 : raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4201 0 : ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4202 : } else {
4203 0 : ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4204 : /* i_block is stored in file system block size */
4205 0 : i_blocks = i_blocks >> (inode->i_blkbits - 9);
4206 0 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4207 0 : raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4208 : }
4209 : return 0;
4210 : }
4211 :
4212 78540999 : static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
4213 : {
4214 78540999 : struct ext4_inode_info *ei = EXT4_I(inode);
4215 78540999 : uid_t i_uid;
4216 78540999 : gid_t i_gid;
4217 78540999 : projid_t i_projid;
4218 78540999 : int block;
4219 78540999 : int err;
4220 :
4221 78540999 : err = ext4_inode_blocks_set(raw_inode, ei);
4222 :
4223 78660399 : raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4224 78660399 : i_uid = i_uid_read(inode);
4225 78665802 : i_gid = i_gid_read(inode);
4226 78675515 : i_projid = from_kprojid(&init_user_ns, ei->i_projid);
4227 78600384 : if (!(test_opt(inode->i_sb, NO_UID32))) {
4228 78600384 : raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
4229 78600384 : raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
4230 : /*
4231 : * Fix up interoperability with old kernels. Otherwise,
4232 : * old inodes get re-used with the upper 16 bits of the
4233 : * uid/gid intact.
4234 : */
4235 78600384 : if (ei->i_dtime && list_empty(&ei->i_orphan)) {
4236 2004570 : raw_inode->i_uid_high = 0;
4237 2004570 : raw_inode->i_gid_high = 0;
4238 : } else {
4239 76595814 : raw_inode->i_uid_high =
4240 76595814 : cpu_to_le16(high_16_bits(i_uid));
4241 76595814 : raw_inode->i_gid_high =
4242 76595814 : cpu_to_le16(high_16_bits(i_gid));
4243 : }
4244 : } else {
4245 0 : raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
4246 0 : raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
4247 0 : raw_inode->i_uid_high = 0;
4248 0 : raw_inode->i_gid_high = 0;
4249 : }
4250 78600384 : raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
4251 :
4252 78600384 : EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4253 78600384 : EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4254 78600384 : EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4255 78600384 : EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4256 :
4257 78600384 : raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4258 78600384 : raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4259 78600384 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
4260 78600384 : raw_inode->i_file_acl_high =
4261 78600384 : cpu_to_le16(ei->i_file_acl >> 32);
4262 78600384 : raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4263 78600384 : ext4_isize_set(raw_inode, ei->i_disksize);
4264 :
4265 78600384 : raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4266 78600384 : if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4267 1325424 : if (old_valid_dev(inode->i_rdev)) {
4268 662712 : raw_inode->i_block[0] =
4269 662712 : cpu_to_le32(old_encode_dev(inode->i_rdev));
4270 662712 : raw_inode->i_block[1] = 0;
4271 : } else {
4272 0 : raw_inode->i_block[0] = 0;
4273 0 : raw_inode->i_block[1] =
4274 0 : cpu_to_le32(new_encode_dev(inode->i_rdev));
4275 0 : raw_inode->i_block[2] = 0;
4276 : }
4277 77937672 : } else if (!ext4_has_inline_data(inode)) {
4278 1242517685 : for (block = 0; block < EXT4_N_BLOCKS; block++)
4279 1164661795 : raw_inode->i_block[block] = ei->i_data[block];
4280 : }
4281 :
4282 78518602 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4283 78518602 : u64 ivers = ext4_inode_peek_iversion(inode);
4284 :
4285 78518602 : raw_inode->i_disk_version = cpu_to_le32(ivers);
4286 78518602 : if (ei->i_extra_isize) {
4287 78580234 : if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4288 78575347 : raw_inode->i_version_hi =
4289 78575347 : cpu_to_le32(ivers >> 32);
4290 78580234 : raw_inode->i_extra_isize =
4291 78580234 : cpu_to_le16(ei->i_extra_isize);
4292 : }
4293 : }
4294 :
4295 78518602 : if (i_projid != EXT4_DEF_PROJID &&
4296 53227 : !ext4_has_feature_project(inode->i_sb))
4297 0 : err = err ?: -EFSCORRUPTED;
4298 :
4299 78518602 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4300 78553841 : EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4301 78562367 : raw_inode->i_projid = cpu_to_le32(i_projid);
4302 :
4303 78518602 : ext4_inode_csum_set(inode, raw_inode, ei);
4304 78624253 : return err;
4305 : }
4306 :
4307 : /*
4308 : * ext4_get_inode_loc returns with an extra refcount against the inode's
4309 : * underlying buffer_head on success. If we pass 'inode' and it does not
4310 : * have in-inode xattr, we have all inode data in memory that is needed
4311 : * to recreate the on-disk version of this inode.
4312 : */
4313 80074534 : static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
4314 : struct inode *inode, struct ext4_iloc *iloc,
4315 : ext4_fsblk_t *ret_block)
4316 : {
4317 80074534 : struct ext4_group_desc *gdp;
4318 80074534 : struct buffer_head *bh;
4319 80074534 : ext4_fsblk_t block;
4320 80074534 : struct blk_plug plug;
4321 80074534 : int inodes_per_block, inode_offset;
4322 :
4323 80074534 : iloc->bh = NULL;
4324 80074534 : if (ino < EXT4_ROOT_INO ||
4325 80074534 : ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
4326 : return -EFSCORRUPTED;
4327 :
4328 80074534 : iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
4329 80074534 : gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4330 79839420 : if (!gdp)
4331 : return -EIO;
4332 :
4333 : /*
4334 : * Figure out the offset within the block group inode table
4335 : */
4336 79839420 : inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4337 0 : inode_offset = ((ino - 1) %
4338 79839420 : EXT4_INODES_PER_GROUP(sb));
4339 79839420 : iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4340 :
4341 79839420 : block = ext4_inode_table(sb, gdp);
4342 159903424 : if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
4343 : (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
4344 0 : ext4_error(sb, "Invalid inode table block %llu in "
4345 : "block_group %u", block, iloc->block_group);
4346 0 : return -EFSCORRUPTED;
4347 : }
4348 79951712 : block += (inode_offset / inodes_per_block);
4349 :
4350 79951712 : bh = sb_getblk(sb, block);
4351 80079540 : if (unlikely(!bh))
4352 : return -ENOMEM;
4353 80079540 : if (ext4_buffer_uptodate(bh))
4354 79716460 : goto has_buffer;
4355 :
4356 231600 : lock_buffer(bh);
4357 231558 : if (ext4_buffer_uptodate(bh)) {
4358 : /* Someone brought it uptodate while we waited */
4359 48947 : unlock_buffer(bh);
4360 49321 : goto has_buffer;
4361 : }
4362 :
4363 : /*
4364 : * If we have all information of the inode in memory and this
4365 : * is the only valid inode in the block, we need not read the
4366 : * block.
4367 : */
4368 182390 : if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4369 179436 : struct buffer_head *bitmap_bh;
4370 179436 : int i, start;
4371 :
4372 179436 : start = inode_offset & ~(inodes_per_block - 1);
4373 :
4374 : /* Is the inode bitmap in cache? */
4375 179436 : bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4376 179675 : if (unlikely(!bitmap_bh))
4377 0 : goto make_io;
4378 :
4379 : /*
4380 : * If the inode bitmap isn't in cache then the
4381 : * optimisation may end up performing two reads instead
4382 : * of one, so skip it.
4383 : */
4384 359104 : if (!buffer_uptodate(bitmap_bh)) {
4385 41 : brelse(bitmap_bh);
4386 41 : goto make_io;
4387 : }
4388 1791871 : for (i = start; i < start + inodes_per_block; i++) {
4389 1615944 : if (i == inode_offset)
4390 177874 : continue;
4391 1438070 : if (ext4_test_bit(i, bitmap_bh->b_data))
4392 : break;
4393 : }
4394 179692 : brelse(bitmap_bh);
4395 179652 : if (i == start + inodes_per_block) {
4396 175887 : struct ext4_inode *raw_inode =
4397 175887 : (struct ext4_inode *) (bh->b_data + iloc->offset);
4398 :
4399 : /* all other inodes are free, so skip I/O */
4400 175887 : memset(bh->b_data, 0, bh->b_size);
4401 175887 : if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
4402 0 : ext4_fill_raw_inode(inode, raw_inode);
4403 175887 : set_buffer_uptodate(bh);
4404 176002 : unlock_buffer(bh);
4405 175975 : goto has_buffer;
4406 : }
4407 : }
4408 :
4409 6719 : make_io:
4410 : /*
4411 : * If we need to do any I/O, try to pre-readahead extra
4412 : * blocks from the inode table.
4413 : */
4414 6760 : blk_start_plug(&plug);
4415 6760 : if (EXT4_SB(sb)->s_inode_readahead_blks) {
4416 6760 : ext4_fsblk_t b, end, table;
4417 6760 : unsigned num;
4418 6760 : __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
4419 :
4420 6760 : table = ext4_inode_table(sb, gdp);
4421 : /* s_inode_readahead_blks is always a power of 2 */
4422 6760 : b = block & ~((ext4_fsblk_t) ra_blks - 1);
4423 6760 : if (table > b)
4424 : b = table;
4425 6760 : end = b + ra_blks;
4426 6760 : num = EXT4_INODES_PER_GROUP(sb);
4427 6760 : if (ext4_has_group_desc_csum(sb))
4428 6568 : num -= ext4_itable_unused_count(sb, gdp);
4429 6760 : table += num / inodes_per_block;
4430 6760 : if (end > table)
4431 : end = table;
4432 97221 : while (b <= end)
4433 90461 : ext4_sb_breadahead_unmovable(sb, b++);
4434 : }
4435 :
4436 : /*
4437 : * There are other valid inodes in the buffer, this inode
4438 : * has in-inode xattrs, or we don't have this inode in memory.
4439 : * Read the block from disk.
4440 : */
4441 6760 : trace_ext4_load_inode(sb, ino);
4442 6760 : ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
4443 6759 : blk_finish_plug(&plug);
4444 6760 : wait_on_buffer(bh);
4445 6754 : ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
4446 13506 : if (!buffer_uptodate(bh)) {
4447 1 : if (ret_block)
4448 1 : *ret_block = block;
4449 1 : brelse(bh);
4450 1 : return -EIO;
4451 : }
4452 6752 : has_buffer:
4453 79948508 : iloc->bh = bh;
4454 79948508 : return 0;
4455 : }
4456 :
4457 342060 : static int __ext4_get_inode_loc_noinmem(struct inode *inode,
4458 : struct ext4_iloc *iloc)
4459 : {
4460 342060 : ext4_fsblk_t err_blk = 0;
4461 342060 : int ret;
4462 :
4463 342060 : ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
4464 : &err_blk);
4465 :
4466 342060 : if (ret == -EIO)
4467 0 : ext4_error_inode_block(inode, err_blk, EIO,
4468 : "unable to read itable block");
4469 :
4470 342060 : return ret;
4471 : }
4472 :
4473 79614061 : int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4474 : {
4475 79614061 : ext4_fsblk_t err_blk = 0;
4476 79614061 : int ret;
4477 :
4478 79614061 : ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
4479 : &err_blk);
4480 :
4481 79604583 : if (ret == -EIO)
4482 1 : ext4_error_inode_block(inode, err_blk, EIO,
4483 : "unable to read itable block");
4484 :
4485 79604583 : return ret;
4486 : }
4487 :
4488 :
4489 0 : int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
4490 : struct ext4_iloc *iloc)
4491 : {
4492 0 : return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
4493 : }
4494 :
4495 3064181 : static bool ext4_should_enable_dax(struct inode *inode)
4496 : {
4497 3064181 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4498 :
4499 3064181 : if (test_opt2(inode->i_sb, DAX_NEVER))
4500 : return false;
4501 3064181 : if (!S_ISREG(inode->i_mode))
4502 : return false;
4503 2276706 : if (ext4_should_journal_data(inode))
4504 : return false;
4505 2270431 : if (ext4_has_inline_data(inode))
4506 : return false;
4507 2270431 : if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
4508 : return false;
4509 2270431 : if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
4510 : return false;
4511 2270431 : if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
4512 : return false;
4513 0 : if (test_opt(inode->i_sb, DAX_ALWAYS))
4514 : return true;
4515 :
4516 0 : return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
4517 : }
4518 :
4519 3081429 : void ext4_set_inode_flags(struct inode *inode, bool init)
4520 : {
4521 3081429 : unsigned int flags = EXT4_I(inode)->i_flags;
4522 3081429 : unsigned int new_fl = 0;
4523 :
4524 6162858 : WARN_ON_ONCE(IS_DAX(inode) && init);
4525 :
4526 3081429 : if (flags & EXT4_SYNC_FL)
4527 23 : new_fl |= S_SYNC;
4528 3081429 : if (flags & EXT4_APPEND_FL)
4529 28 : new_fl |= S_APPEND;
4530 3081429 : if (flags & EXT4_IMMUTABLE_FL)
4531 152 : new_fl |= S_IMMUTABLE;
4532 3081429 : if (flags & EXT4_NOATIME_FL)
4533 7 : new_fl |= S_NOATIME;
4534 3081429 : if (flags & EXT4_DIRSYNC_FL)
4535 0 : new_fl |= S_DIRSYNC;
4536 :
4537 : /* Because of the way inode_set_flags() works we must preserve S_DAX
4538 : * here if already set. */
4539 3081429 : new_fl |= (inode->i_flags & S_DAX);
4540 3081429 : if (init && ext4_should_enable_dax(inode))
4541 0 : new_fl |= S_DAX;
4542 :
4543 3064172 : if (flags & EXT4_ENCRYPT_FL)
4544 0 : new_fl |= S_ENCRYPTED;
4545 3064172 : if (flags & EXT4_CASEFOLD_FL)
4546 0 : new_fl |= S_CASEFOLD;
4547 3064172 : if (flags & EXT4_VERITY_FL)
4548 0 : new_fl |= S_VERITY;
4549 3064172 : inode_set_flags(inode, new_fl,
4550 : S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
4551 : S_ENCRYPTED|S_CASEFOLD|S_VERITY);
4552 3090090 : }
4553 :
4554 240999 : static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4555 : struct ext4_inode_info *ei)
4556 : {
4557 240999 : blkcnt_t i_blocks ;
4558 240999 : struct inode *inode = &(ei->vfs_inode);
4559 240999 : struct super_block *sb = inode->i_sb;
4560 :
4561 240999 : if (ext4_has_feature_huge_file(sb)) {
4562 : /* we are using combined 48 bit field */
4563 240542 : i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4564 240542 : le32_to_cpu(raw_inode->i_blocks_lo);
4565 240542 : if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4566 : /* i_blocks represent file system block size */
4567 0 : return i_blocks << (inode->i_blkbits - 9);
4568 : } else {
4569 : return i_blocks;
4570 : }
4571 : } else {
4572 457 : return le32_to_cpu(raw_inode->i_blocks_lo);
4573 : }
4574 : }
4575 :
4576 240663 : static inline int ext4_iget_extra_inode(struct inode *inode,
4577 : struct ext4_inode *raw_inode,
4578 : struct ext4_inode_info *ei)
4579 : {
4580 240663 : __le32 *magic = (void *)raw_inode +
4581 240663 : EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
4582 :
4583 240663 : if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
4584 240663 : *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
4585 5515 : int err;
4586 :
4587 5515 : ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4588 5515 : err = ext4_find_inline_data_nolock(inode);
4589 5514 : if (!err && ext4_has_inline_data(inode))
4590 0 : ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
4591 5514 : return err;
4592 : } else
4593 235148 : EXT4_I(inode)->i_inline_off = 0;
4594 235148 : return 0;
4595 : }
4596 :
4597 154 : int ext4_get_projid(struct inode *inode, kprojid_t *projid)
4598 : {
4599 154 : if (!ext4_has_feature_project(inode->i_sb))
4600 : return -EOPNOTSUPP;
4601 154 : *projid = EXT4_I(inode)->i_projid;
4602 154 : return 0;
4603 : }
4604 :
4605 : /*
4606 : * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of
4607 : * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag
4608 : * set.
4609 : */
4610 : static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
4611 : {
4612 240999 : if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4613 0 : inode_set_iversion_raw(inode, val);
4614 : else
4615 240999 : inode_set_iversion_queried(inode, val);
4616 : }
4617 :
4618 289083 : static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
4619 :
4620 : {
4621 289083 : if (flags & EXT4_IGET_EA_INODE) {
4622 85 : if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4623 : return "missing EA_INODE flag";
4624 85 : if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
4625 85 : EXT4_I(inode)->i_file_acl)
4626 : return "ea_inode with extended attributes";
4627 : } else {
4628 288998 : if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4629 : return "unexpected EA_INODE flag";
4630 : }
4631 289083 : if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
4632 0 : return "unexpected bad inode w/o EXT4_IGET_BAD";
4633 : return NULL;
4634 : }
4635 :
4636 290122 : struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
4637 : ext4_iget_flags flags, const char *function,
4638 : unsigned int line)
4639 : {
4640 290122 : struct ext4_iloc iloc;
4641 290122 : struct ext4_inode *raw_inode;
4642 290122 : struct ext4_inode_info *ei;
4643 290122 : struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4644 290122 : struct inode *inode;
4645 290122 : const char *err_str;
4646 290122 : journal_t *journal = EXT4_SB(sb)->s_journal;
4647 290122 : long ret;
4648 290122 : loff_t size;
4649 290122 : int block;
4650 290122 : uid_t i_uid;
4651 290122 : gid_t i_gid;
4652 290122 : projid_t i_projid;
4653 :
4654 290122 : if ((!(flags & EXT4_IGET_SPECIAL) &&
4655 282380 : ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
4656 282380 : ino == le32_to_cpu(es->s_usr_quota_inum) ||
4657 282380 : ino == le32_to_cpu(es->s_grp_quota_inum) ||
4658 282380 : ino == le32_to_cpu(es->s_prj_quota_inum) ||
4659 290122 : ino == le32_to_cpu(es->s_orphan_file_inum))) ||
4660 290122 : (ino < EXT4_ROOT_INO) ||
4661 290122 : (ino > le32_to_cpu(es->s_inodes_count))) {
4662 5 : if (flags & EXT4_IGET_HANDLE)
4663 : return ERR_PTR(-ESTALE);
4664 0 : __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
4665 : "inode #%lu: comm %s: iget: illegal inode #",
4666 0 : ino, current->comm);
4667 0 : return ERR_PTR(-EFSCORRUPTED);
4668 : }
4669 :
4670 290117 : inode = iget_locked(sb, ino);
4671 290126 : if (!inode)
4672 : return ERR_PTR(-ENOMEM);
4673 290126 : if (!(inode->i_state & I_NEW)) {
4674 48090 : if ((err_str = check_igot_inode(inode, flags)) != NULL) {
4675 0 : ext4_error_inode(inode, function, line, 0, err_str);
4676 0 : iput(inode);
4677 0 : return ERR_PTR(-EFSCORRUPTED);
4678 : }
4679 : return inode;
4680 : }
4681 :
4682 242036 : ei = EXT4_I(inode);
4683 242036 : iloc.bh = NULL;
4684 :
4685 242036 : ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
4686 242036 : if (ret < 0)
4687 0 : goto bad_inode;
4688 242036 : raw_inode = ext4_raw_inode(&iloc);
4689 :
4690 242036 : if ((flags & EXT4_IGET_HANDLE) &&
4691 4213 : (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
4692 0 : ret = -ESTALE;
4693 0 : goto bad_inode;
4694 : }
4695 :
4696 242036 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4697 241700 : ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4698 241700 : if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4699 241700 : EXT4_INODE_SIZE(inode->i_sb) ||
4700 : (ei->i_extra_isize & 3)) {
4701 0 : ext4_error_inode(inode, function, line, 0,
4702 : "iget: bad extra_isize %u "
4703 : "(inode size %u)",
4704 : ei->i_extra_isize,
4705 : EXT4_INODE_SIZE(inode->i_sb));
4706 0 : ret = -EFSCORRUPTED;
4707 0 : goto bad_inode;
4708 : }
4709 : } else
4710 336 : ei->i_extra_isize = 0;
4711 :
4712 : /* Precompute checksum seed for inode metadata */
4713 242036 : if (ext4_has_metadata_csum(sb)) {
4714 241579 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4715 241579 : __u32 csum;
4716 241579 : __le32 inum = cpu_to_le32(inode->i_ino);
4717 241579 : __le32 gen = raw_inode->i_generation;
4718 241579 : csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
4719 : sizeof(inum));
4720 241579 : ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
4721 : sizeof(gen));
4722 : }
4723 :
4724 242036 : if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
4725 2 : ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
4726 2 : (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
4727 2 : ext4_error_inode_err(inode, function, line, 0,
4728 : EFSBADCRC, "iget: checksum invalid");
4729 2 : ret = -EFSBADCRC;
4730 2 : goto bad_inode;
4731 : }
4732 :
4733 242034 : inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4734 242034 : i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4735 242034 : i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4736 242034 : if (ext4_has_feature_project(sb) &&
4737 253 : EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4738 253 : EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4739 253 : i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
4740 : else
4741 : i_projid = EXT4_DEF_PROJID;
4742 :
4743 242034 : if (!(test_opt(inode->i_sb, NO_UID32))) {
4744 242030 : i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4745 242030 : i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4746 : }
4747 242034 : i_uid_write(inode, i_uid);
4748 242033 : i_gid_write(inode, i_gid);
4749 242034 : ei->i_projid = make_kprojid(&init_user_ns, i_projid);
4750 242034 : set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
4751 :
4752 242033 : ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4753 242033 : ei->i_inline_off = 0;
4754 242033 : ei->i_dir_start_lookup = 0;
4755 242033 : ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4756 : /* We now have enough fields to check if the inode was active or not.
4757 : * This is needed because nfsd might try to access dead inodes
4758 : * the test is that same one that e2fsck uses
4759 : * NeilBrown 1999oct15
4760 : */
4761 242033 : if (inode->i_nlink == 0) {
4762 51631 : if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
4763 51631 : !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
4764 : ino != EXT4_BOOT_LOADER_INO) {
4765 : /* this inode is deleted or unallocated */
4766 1035 : if (flags & EXT4_IGET_SPECIAL) {
4767 0 : ext4_error_inode(inode, function, line, 0,
4768 : "iget: special inode unallocated");
4769 0 : ret = -EFSCORRUPTED;
4770 : } else
4771 : ret = -ESTALE;
4772 1035 : goto bad_inode;
4773 : }
4774 : /* The only unlinked inodes we let through here have
4775 : * valid i_mode and are being read by the orphan
4776 : * recovery code: that's fine, we're about to complete
4777 : * the process of deleting those.
4778 : * OR it is the EXT4_BOOT_LOADER_INO which is
4779 : * not initialized on a new filesystem. */
4780 : }
4781 240998 : ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4782 240998 : ext4_set_inode_flags(inode, true);
4783 240999 : inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4784 240999 : ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4785 240999 : if (ext4_has_feature_64bit(sb))
4786 240539 : ei->i_file_acl |=
4787 240539 : ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4788 240999 : inode->i_size = ext4_isize(sb, raw_inode);
4789 240999 : if ((size = i_size_read(inode)) < 0) {
4790 0 : ext4_error_inode(inode, function, line, 0,
4791 : "iget: bad i_size value: %lld", size);
4792 0 : ret = -EFSCORRUPTED;
4793 0 : goto bad_inode;
4794 : }
4795 : /*
4796 : * If dir_index is not enabled but there's dir with INDEX flag set,
4797 : * we'd normally treat htree data as empty space. But with metadata
4798 : * checksumming that corrupts checksums so forbid that.
4799 : */
4800 240999 : if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
4801 : ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
4802 0 : ext4_error_inode(inode, function, line, 0,
4803 : "iget: Dir with htree data on filesystem without dir_index feature.");
4804 0 : ret = -EFSCORRUPTED;
4805 0 : goto bad_inode;
4806 : }
4807 240999 : ei->i_disksize = inode->i_size;
4808 : #ifdef CONFIG_QUOTA
4809 240999 : ei->i_reserved_quota = 0;
4810 : #endif
4811 240999 : inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4812 240999 : ei->i_block_group = iloc.block_group;
4813 240999 : ei->i_last_alloc_group = ~0;
4814 : /*
4815 : * NOTE! The in-memory inode i_data array is in little-endian order
4816 : * even on big-endian machines: we do NOT byteswap the block numbers!
4817 : */
4818 3855954 : for (block = 0; block < EXT4_N_BLOCKS; block++)
4819 3614955 : ei->i_data[block] = raw_inode->i_block[block];
4820 240999 : INIT_LIST_HEAD(&ei->i_orphan);
4821 240999 : ext4_fc_init_inode(&ei->vfs_inode);
4822 :
4823 : /*
4824 : * Set transaction id's of transactions that have to be committed
4825 : * to finish f[data]sync. We set them to currently running transaction
4826 : * as we cannot be sure that the inode or some of its metadata isn't
4827 : * part of the transaction - the inode could have been reclaimed and
4828 : * now it is reread from disk.
4829 : */
4830 241000 : if (journal) {
4831 238453 : transaction_t *transaction;
4832 238453 : tid_t tid;
4833 :
4834 238453 : read_lock(&journal->j_state_lock);
4835 238452 : if (journal->j_running_transaction)
4836 : transaction = journal->j_running_transaction;
4837 : else
4838 52610 : transaction = journal->j_committing_transaction;
4839 238452 : if (transaction)
4840 185844 : tid = transaction->t_tid;
4841 : else
4842 52608 : tid = journal->j_commit_sequence;
4843 238452 : read_unlock(&journal->j_state_lock);
4844 238453 : ei->i_sync_tid = tid;
4845 238453 : ei->i_datasync_tid = tid;
4846 : }
4847 :
4848 241000 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4849 240663 : if (ei->i_extra_isize == 0) {
4850 : /* The extra space is currently unused. Use it. */
4851 0 : BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
4852 0 : ei->i_extra_isize = sizeof(struct ext4_inode) -
4853 : EXT4_GOOD_OLD_INODE_SIZE;
4854 : } else {
4855 240663 : ret = ext4_iget_extra_inode(inode, raw_inode, ei);
4856 240662 : if (ret)
4857 0 : goto bad_inode;
4858 : }
4859 : }
4860 :
4861 240999 : EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4862 240999 : EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4863 240999 : EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4864 240999 : EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4865 :
4866 240999 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4867 240999 : u64 ivers = le32_to_cpu(raw_inode->i_disk_version);
4868 :
4869 240999 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4870 240663 : if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4871 240663 : ivers |=
4872 240663 : (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4873 : }
4874 240999 : ext4_inode_set_iversion_queried(inode, ivers);
4875 : }
4876 :
4877 240999 : ret = 0;
4878 243427 : if (ei->i_file_acl &&
4879 2428 : !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
4880 0 : ext4_error_inode(inode, function, line, 0,
4881 : "iget: bad extended attribute block %llu",
4882 : ei->i_file_acl);
4883 0 : ret = -EFSCORRUPTED;
4884 0 : goto bad_inode;
4885 240999 : } else if (!ext4_has_inline_data(inode)) {
4886 : /* validate the block references in the inode */
4887 240999 : if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
4888 240998 : (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4889 7368 : (S_ISLNK(inode->i_mode) &&
4890 7368 : !ext4_inode_is_fast_symlink(inode)))) {
4891 217865 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4892 217373 : ret = ext4_ext_check_inode(inode);
4893 : else
4894 492 : ret = ext4_ind_check_inode(inode);
4895 : }
4896 : }
4897 217865 : if (ret)
4898 0 : goto bad_inode;
4899 :
4900 240999 : if (S_ISREG(inode->i_mode)) {
4901 172251 : inode->i_op = &ext4_file_inode_operations;
4902 172251 : inode->i_fop = &ext4_file_operations;
4903 172251 : ext4_set_aops(inode);
4904 68748 : } else if (S_ISDIR(inode->i_mode)) {
4905 38420 : inode->i_op = &ext4_dir_inode_operations;
4906 38420 : inode->i_fop = &ext4_dir_operations;
4907 30328 : } else if (S_ISLNK(inode->i_mode)) {
4908 : /* VFS does not allow setting these so must be corruption */
4909 7368 : if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
4910 0 : ext4_error_inode(inode, function, line, 0,
4911 : "iget: immutable or append flags "
4912 : "not allowed on symlinks");
4913 0 : ret = -EFSCORRUPTED;
4914 0 : goto bad_inode;
4915 : }
4916 7368 : if (IS_ENCRYPTED(inode)) {
4917 0 : inode->i_op = &ext4_encrypted_symlink_inode_operations;
4918 7368 : } else if (ext4_inode_is_fast_symlink(inode)) {
4919 173 : inode->i_link = (char *)ei->i_data;
4920 173 : inode->i_op = &ext4_fast_symlink_inode_operations;
4921 173 : nd_terminate_link(ei->i_data, inode->i_size,
4922 : sizeof(ei->i_data) - 1);
4923 : } else {
4924 7195 : inode->i_op = &ext4_symlink_inode_operations;
4925 : }
4926 22960 : } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4927 0 : S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4928 22960 : inode->i_op = &ext4_special_inode_operations;
4929 22960 : if (raw_inode->i_block[0])
4930 0 : init_special_inode(inode, inode->i_mode,
4931 : old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
4932 : else
4933 22960 : init_special_inode(inode, inode->i_mode,
4934 22960 : new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4935 0 : } else if (ino == EXT4_BOOT_LOADER_INO) {
4936 0 : make_bad_inode(inode);
4937 : } else {
4938 0 : ret = -EFSCORRUPTED;
4939 0 : ext4_error_inode(inode, function, line, 0,
4940 : "iget: bogus i_mode (%o)", inode->i_mode);
4941 0 : goto bad_inode;
4942 : }
4943 240997 : if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
4944 0 : ext4_error_inode(inode, function, line, 0,
4945 : "casefold flag without casefold feature");
4946 240997 : if ((err_str = check_igot_inode(inode, flags)) != NULL) {
4947 0 : ext4_error_inode(inode, function, line, 0, err_str);
4948 0 : ret = -EFSCORRUPTED;
4949 0 : goto bad_inode;
4950 : }
4951 :
4952 240994 : brelse(iloc.bh);
4953 240999 : unlock_new_inode(inode);
4954 240999 : return inode;
4955 :
4956 1037 : bad_inode:
4957 1037 : brelse(iloc.bh);
4958 1037 : iget_failed(inode);
4959 1037 : return ERR_PTR(ret);
4960 : }
4961 :
4962 375 : static void __ext4_update_other_inode_time(struct super_block *sb,
4963 : unsigned long orig_ino,
4964 : unsigned long ino,
4965 : struct ext4_inode *raw_inode)
4966 : {
4967 375 : struct inode *inode;
4968 :
4969 375 : inode = find_inode_by_ino_rcu(sb, ino);
4970 375 : if (!inode)
4971 : return;
4972 :
4973 50 : if (!inode_is_dirtytime_only(inode))
4974 : return;
4975 :
4976 0 : spin_lock(&inode->i_lock);
4977 0 : if (inode_is_dirtytime_only(inode)) {
4978 0 : struct ext4_inode_info *ei = EXT4_I(inode);
4979 :
4980 0 : inode->i_state &= ~I_DIRTY_TIME;
4981 0 : spin_unlock(&inode->i_lock);
4982 :
4983 0 : spin_lock(&ei->i_raw_lock);
4984 0 : EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4985 0 : EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4986 0 : EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4987 0 : ext4_inode_csum_set(inode, raw_inode, ei);
4988 0 : spin_unlock(&ei->i_raw_lock);
4989 0 : trace_ext4_other_inode_update_time(inode, orig_ino);
4990 0 : return;
4991 : }
4992 0 : spin_unlock(&inode->i_lock);
4993 : }
4994 :
4995 : /*
4996 : * Opportunistically update the other time fields for other inodes in
4997 : * the same inode table block.
4998 : */
4999 25 : static void ext4_update_other_inodes_time(struct super_block *sb,
5000 : unsigned long orig_ino, char *buf)
5001 : {
5002 25 : unsigned long ino;
5003 25 : int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
5004 25 : int inode_size = EXT4_INODE_SIZE(sb);
5005 :
5006 : /*
5007 : * Calculate the first inode in the inode table block. Inode
5008 : * numbers are one-based. That is, the first inode in a block
5009 : * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
5010 : */
5011 25 : ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
5012 25 : rcu_read_lock();
5013 450 : for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
5014 400 : if (ino == orig_ino)
5015 25 : continue;
5016 375 : __ext4_update_other_inode_time(sb, orig_ino, ino,
5017 : (struct ext4_inode *)buf);
5018 : }
5019 25 : rcu_read_unlock();
5020 25 : }
5021 :
5022 : /*
5023 : * Post the struct inode info into an on-disk inode location in the
5024 : * buffer-cache. This gobbles the caller's reference to the
5025 : * buffer_head in the inode location struct.
5026 : *
5027 : * The caller must have write access to iloc->bh.
5028 : */
5029 78557940 : static int ext4_do_update_inode(handle_t *handle,
5030 : struct inode *inode,
5031 : struct ext4_iloc *iloc)
5032 : {
5033 78557940 : struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5034 78557940 : struct ext4_inode_info *ei = EXT4_I(inode);
5035 78557940 : struct buffer_head *bh = iloc->bh;
5036 78557940 : struct super_block *sb = inode->i_sb;
5037 78557940 : int err;
5038 78557940 : int need_datasync = 0, set_large_file = 0;
5039 :
5040 78557940 : spin_lock(&ei->i_raw_lock);
5041 :
5042 : /*
5043 : * For fields not tracked in the in-memory inode, initialise them
5044 : * to zero for new inodes.
5045 : */
5046 78555718 : if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5047 5674570 : memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5048 :
5049 78555718 : if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
5050 3165017 : need_datasync = 1;
5051 78555718 : if (ei->i_disksize > 0x7fffffffULL) {
5052 3553746 : if (!ext4_has_feature_large_file(sb) ||
5053 3553805 : EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
5054 : set_large_file = 1;
5055 : }
5056 :
5057 78555718 : err = ext4_fill_raw_inode(inode, raw_inode);
5058 78608308 : spin_unlock(&ei->i_raw_lock);
5059 78683641 : if (err) {
5060 0 : EXT4_ERROR_INODE(inode, "corrupted inode contents");
5061 0 : goto out_brelse;
5062 : }
5063 :
5064 78683641 : if (inode->i_sb->s_flags & SB_LAZYTIME)
5065 25 : ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
5066 : bh->b_data);
5067 :
5068 78683641 : BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5069 78683641 : err = ext4_handle_dirty_metadata(handle, NULL, bh);
5070 78515078 : if (err)
5071 0 : goto out_error;
5072 78515078 : ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5073 78677201 : if (set_large_file) {
5074 0 : BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
5075 0 : err = ext4_journal_get_write_access(handle, sb,
5076 : EXT4_SB(sb)->s_sbh,
5077 : EXT4_JTR_NONE);
5078 0 : if (err)
5079 0 : goto out_error;
5080 0 : lock_buffer(EXT4_SB(sb)->s_sbh);
5081 0 : ext4_set_feature_large_file(sb);
5082 0 : ext4_superblock_csum_set(sb);
5083 0 : unlock_buffer(EXT4_SB(sb)->s_sbh);
5084 0 : ext4_handle_sync(handle);
5085 0 : err = ext4_handle_dirty_metadata(handle, NULL,
5086 : EXT4_SB(sb)->s_sbh);
5087 : }
5088 78677201 : ext4_update_inode_fsync_trans(handle, inode, need_datasync);
5089 78645483 : out_error:
5090 78645483 : ext4_std_error(inode->i_sb, err);
5091 78645483 : out_brelse:
5092 78645483 : brelse(bh);
5093 78641312 : return err;
5094 : }
5095 :
5096 : /*
5097 : * ext4_write_inode()
5098 : *
5099 : * We are called from a few places:
5100 : *
5101 : * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
5102 : * Here, there will be no transaction running. We wait for any running
5103 : * transaction to commit.
5104 : *
5105 : * - Within flush work (sys_sync(), kupdate and such).
5106 : * We wait on commit, if told to.
5107 : *
5108 : * - Within iput_final() -> write_inode_now()
5109 : * We wait on commit, if told to.
5110 : *
5111 : * In all cases it is actually safe for us to return without doing anything,
5112 : * because the inode has been copied into a raw inode buffer in
5113 : * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
5114 : * writeback.
5115 : *
5116 : * Note that we are absolutely dependent upon all inode dirtiers doing the
5117 : * right thing: they *must* call mark_inode_dirty() after dirtying info in
5118 : * which we are interested.
5119 : *
5120 : * It would be a bug for them to not do this. The code:
5121 : *
5122 : * mark_inode_dirty(inode)
5123 : * stuff();
5124 : * inode->i_size = expr;
5125 : *
5126 : * is in error because write_inode() could occur while `stuff()' is running,
5127 : * and the new i_size will be lost. Plus the inode will no longer be on the
5128 : * superblock's dirty inode list.
5129 : */
5130 922475 : int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5131 : {
5132 922475 : int err;
5133 :
5134 922475 : if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
5135 922475 : sb_rdonly(inode->i_sb))
5136 : return 0;
5137 :
5138 1844950 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5139 : return -EIO;
5140 :
5141 894209 : if (EXT4_SB(inode->i_sb)->s_journal) {
5142 794185 : if (ext4_journal_current_handle()) {
5143 0 : ext4_debug("called recursively, non-PF_MEMALLOC!\n");
5144 0 : dump_stack();
5145 0 : return -EIO;
5146 : }
5147 :
5148 : /*
5149 : * No need to force transaction in WB_SYNC_NONE mode. Also
5150 : * ext4_sync_fs() will force the commit after everything is
5151 : * written.
5152 : */
5153 794185 : if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
5154 : return 0;
5155 :
5156 1 : err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
5157 1 : EXT4_I(inode)->i_sync_tid);
5158 : } else {
5159 100024 : struct ext4_iloc iloc;
5160 :
5161 100024 : err = __ext4_get_inode_loc_noinmem(inode, &iloc);
5162 100024 : if (err)
5163 0 : return err;
5164 : /*
5165 : * sync(2) will flush the whole buffer cache. No need to do
5166 : * it here separately for each inode.
5167 : */
5168 100024 : if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
5169 21 : sync_dirty_buffer(iloc.bh);
5170 200220 : if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5171 0 : ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
5172 : "IO error syncing inode");
5173 0 : err = -EIO;
5174 : }
5175 100024 : brelse(iloc.bh);
5176 : }
5177 : return err;
5178 : }
5179 :
5180 : /*
5181 : * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
5182 : * buffers that are attached to a folio straddling i_size and are undergoing
5183 : * commit. In that case we have to wait for commit to finish and try again.
5184 : */
5185 1065 : static void ext4_wait_for_tail_page_commit(struct inode *inode)
5186 : {
5187 1065 : unsigned offset;
5188 1065 : journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
5189 1065 : tid_t commit_tid = 0;
5190 1065 : int ret;
5191 :
5192 1065 : offset = inode->i_size & (PAGE_SIZE - 1);
5193 : /*
5194 : * If the folio is fully truncated, we don't need to wait for any commit
5195 : * (and we even should not as __ext4_journalled_invalidate_folio() may
5196 : * strip all buffers from the folio but keep the folio dirty which can then
5197 : * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
5198 : * buffers). Also we don't need to wait for any commit if all buffers in
5199 : * the folio remain valid. This is most beneficial for the common case of
5200 : * blocksize == PAGESIZE.
5201 : */
5202 1065 : if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
5203 1065 : return;
5204 0 : while (1) {
5205 0 : struct folio *folio = filemap_lock_folio(inode->i_mapping,
5206 0 : inode->i_size >> PAGE_SHIFT);
5207 0 : if (IS_ERR(folio))
5208 : return;
5209 0 : ret = __ext4_journalled_invalidate_folio(folio, offset,
5210 0 : folio_size(folio) - offset);
5211 0 : folio_unlock(folio);
5212 0 : folio_put(folio);
5213 0 : if (ret != -EBUSY)
5214 : return;
5215 0 : commit_tid = 0;
5216 0 : read_lock(&journal->j_state_lock);
5217 0 : if (journal->j_committing_transaction)
5218 0 : commit_tid = journal->j_committing_transaction->t_tid;
5219 0 : read_unlock(&journal->j_state_lock);
5220 0 : if (commit_tid)
5221 0 : jbd2_log_wait_commit(journal, commit_tid);
5222 : }
5223 : }
5224 :
5225 : /*
5226 : * ext4_setattr()
5227 : *
5228 : * Called from notify_change.
5229 : *
5230 : * We want to trap VFS attempts to truncate the file as soon as
5231 : * possible. In particular, we want to make sure that when the VFS
5232 : * shrinks i_size, we put the inode on the orphan list and modify
5233 : * i_disksize immediately, so that during the subsequent flushing of
5234 : * dirty pages and freeing of disk blocks, we can guarantee that any
5235 : * commit will leave the blocks being flushed in an unused state on
5236 : * disk. (On recovery, the inode will get truncated and the blocks will
5237 : * be freed, so we have a strong guarantee that no future commit will
5238 : * leave these blocks visible to the user.)
5239 : *
5240 : * Another thing we have to assure is that if we are in ordered mode
5241 : * and inode is still attached to the committing transaction, we must
5242 : * we start writeout of all the dirty pages which are being truncated.
5243 : * This way we are sure that all the data written in the previous
5244 : * transaction are already on disk (truncate waits for pages under
5245 : * writeback).
5246 : *
5247 : * Called with inode->i_rwsem down.
5248 : */
5249 2616098 : int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5250 : struct iattr *attr)
5251 : {
5252 2616098 : struct inode *inode = d_inode(dentry);
5253 2616098 : int error, rc = 0;
5254 2616098 : int orphan = 0;
5255 2616098 : const unsigned int ia_valid = attr->ia_valid;
5256 2616098 : bool inc_ivers = true;
5257 :
5258 5232196 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5259 : return -EIO;
5260 :
5261 2615988 : if (unlikely(IS_IMMUTABLE(inode)))
5262 : return -EPERM;
5263 :
5264 2615988 : if (unlikely(IS_APPEND(inode) &&
5265 : (ia_valid & (ATTR_MODE | ATTR_UID |
5266 : ATTR_GID | ATTR_TIMES_SET))))
5267 : return -EPERM;
5268 :
5269 2615988 : error = setattr_prepare(idmap, dentry, attr);
5270 2615974 : if (error)
5271 : return error;
5272 :
5273 2615872 : error = fscrypt_prepare_setattr(dentry, attr);
5274 2615872 : if (error)
5275 : return error;
5276 :
5277 2615872 : error = fsverity_prepare_setattr(dentry, attr);
5278 2615872 : if (error)
5279 : return error;
5280 :
5281 2615872 : if (is_quota_modification(idmap, inode, attr)) {
5282 1546857 : error = dquot_initialize(inode);
5283 1546859 : if (error)
5284 : return error;
5285 : }
5286 :
5287 4202287 : if (i_uid_needs_update(idmap, attr, inode) ||
5288 1586435 : i_gid_needs_update(idmap, attr, inode)) {
5289 1030301 : handle_t *handle;
5290 :
5291 : /* (user+group)*(old+new) structure, inode write (sb,
5292 : * inode block, ? - but truncate inode update has it) */
5293 1041441 : handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5294 : (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
5295 : EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
5296 1030299 : if (IS_ERR(handle)) {
5297 0 : error = PTR_ERR(handle);
5298 0 : goto err_out;
5299 : }
5300 :
5301 : /* dquot_transfer() calls back ext4_get_inode_usage() which
5302 : * counts xattr inode references.
5303 : */
5304 1030299 : down_read(&EXT4_I(inode)->xattr_sem);
5305 1030297 : error = dquot_transfer(idmap, inode, attr);
5306 1030295 : up_read(&EXT4_I(inode)->xattr_sem);
5307 :
5308 1030296 : if (error) {
5309 1 : ext4_journal_stop(handle);
5310 1 : return error;
5311 : }
5312 : /* Update corresponding info in inode so that everything is in
5313 : * one transaction */
5314 1030295 : i_uid_update(idmap, attr, inode);
5315 1030298 : i_gid_update(idmap, attr, inode);
5316 1030295 : error = ext4_mark_inode_dirty(handle, inode);
5317 1030303 : ext4_journal_stop(handle);
5318 1030290 : if (unlikely(error)) {
5319 : return error;
5320 : }
5321 : }
5322 :
5323 2615849 : if (attr->ia_valid & ATTR_SIZE) {
5324 516538 : handle_t *handle;
5325 516538 : loff_t oldsize = inode->i_size;
5326 516538 : loff_t old_disksize;
5327 516538 : int shrink = (attr->ia_size < inode->i_size);
5328 :
5329 516538 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5330 28 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5331 :
5332 28 : if (attr->ia_size > sbi->s_bitmap_maxbytes) {
5333 : return -EFBIG;
5334 : }
5335 : }
5336 516538 : if (!S_ISREG(inode->i_mode)) {
5337 : return -EINVAL;
5338 : }
5339 :
5340 516538 : if (attr->ia_size == inode->i_size)
5341 63165 : inc_ivers = false;
5342 :
5343 516538 : if (shrink) {
5344 295955 : if (ext4_should_order_data(inode)) {
5345 293313 : error = ext4_begin_ordered_truncate(inode,
5346 : attr->ia_size);
5347 293320 : if (error)
5348 0 : goto err_out;
5349 : }
5350 : /*
5351 : * Blocks are going to be removed from the inode. Wait
5352 : * for dio in flight.
5353 : */
5354 295967 : inode_dio_wait(inode);
5355 : }
5356 :
5357 516546 : filemap_invalidate_lock(inode->i_mapping);
5358 :
5359 516597 : rc = ext4_break_layouts(inode);
5360 516539 : if (rc) {
5361 0 : filemap_invalidate_unlock(inode->i_mapping);
5362 0 : goto err_out;
5363 : }
5364 :
5365 516539 : if (attr->ia_size != inode->i_size) {
5366 453377 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
5367 453399 : if (IS_ERR(handle)) {
5368 0 : error = PTR_ERR(handle);
5369 0 : goto out_mmap_sem;
5370 : }
5371 453399 : if (ext4_handle_valid(handle) && shrink) {
5372 295954 : error = ext4_orphan_add(handle, inode);
5373 295954 : orphan = 1;
5374 : }
5375 : /*
5376 : * Update c/mtime on truncate up, ext4_truncate() will
5377 : * update c/mtime in shrink case below
5378 : */
5379 453423 : if (!shrink) {
5380 157436 : inode->i_mtime = current_time(inode);
5381 157426 : inode->i_ctime = inode->i_mtime;
5382 : }
5383 :
5384 453413 : if (shrink)
5385 591958 : ext4_fc_track_range(handle, inode,
5386 295979 : (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
5387 295979 : inode->i_sb->s_blocksize_bits,
5388 : EXT_MAX_BLOCKS - 1);
5389 : else
5390 157434 : ext4_fc_track_range(
5391 : handle, inode,
5392 280249 : (oldsize > 0 ? oldsize - 1 : oldsize) >>
5393 157434 : inode->i_sb->s_blocksize_bits,
5394 157434 : (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
5395 157434 : inode->i_sb->s_blocksize_bits);
5396 :
5397 453399 : down_write(&EXT4_I(inode)->i_data_sem);
5398 453403 : old_disksize = EXT4_I(inode)->i_disksize;
5399 453403 : EXT4_I(inode)->i_disksize = attr->ia_size;
5400 453403 : rc = ext4_mark_inode_dirty(handle, inode);
5401 453425 : if (!error)
5402 453421 : error = rc;
5403 : /*
5404 : * We have to update i_size under i_data_sem together
5405 : * with i_disksize to avoid races with writeback code
5406 : * running ext4_wb_update_i_disksize().
5407 : */
5408 453425 : if (!error)
5409 453425 : i_size_write(inode, attr->ia_size);
5410 : else
5411 0 : EXT4_I(inode)->i_disksize = old_disksize;
5412 453425 : up_write(&EXT4_I(inode)->i_data_sem);
5413 453418 : ext4_journal_stop(handle);
5414 453416 : if (error)
5415 0 : goto out_mmap_sem;
5416 453416 : if (!shrink) {
5417 157442 : pagecache_isize_extended(inode, oldsize,
5418 : inode->i_size);
5419 295974 : } else if (ext4_should_journal_data(inode)) {
5420 1065 : ext4_wait_for_tail_page_commit(inode);
5421 : }
5422 : }
5423 :
5424 : /*
5425 : * Truncate pagecache after we've waited for commit
5426 : * in data=journal mode to make pages freeable.
5427 : */
5428 516569 : truncate_pagecache(inode, inode->i_size);
5429 : /*
5430 : * Call ext4_truncate() even if i_size didn't change to
5431 : * truncate possible preallocated blocks.
5432 : */
5433 516599 : if (attr->ia_size <= oldsize) {
5434 359169 : rc = ext4_truncate(inode);
5435 359187 : if (rc)
5436 0 : error = rc;
5437 : }
5438 516617 : out_mmap_sem:
5439 516617 : filemap_invalidate_unlock(inode->i_mapping);
5440 : }
5441 :
5442 2615932 : if (!error) {
5443 2615935 : if (inc_ivers)
5444 2552728 : inode_inc_iversion(inode);
5445 2615949 : setattr_copy(idmap, inode, attr);
5446 2615916 : mark_inode_dirty(inode);
5447 : }
5448 :
5449 : /*
5450 : * If the call to ext4_truncate failed to get a transaction handle at
5451 : * all, we need to clean up the in-core orphan list manually.
5452 : */
5453 2615980 : if (orphan && inode->i_nlink)
5454 295978 : ext4_orphan_del(NULL, inode);
5455 :
5456 2615981 : if (!error && (ia_valid & ATTR_MODE))
5457 40017 : rc = posix_acl_chmod(idmap, dentry, inode->i_mode);
5458 :
5459 2575964 : err_out:
5460 2615969 : if (error)
5461 0 : ext4_std_error(inode->i_sb, error);
5462 2615969 : if (!error)
5463 2615970 : error = rc;
5464 : return error;
5465 : }
5466 :
5467 2919571 : u32 ext4_dio_alignment(struct inode *inode)
5468 : {
5469 2919571 : if (fsverity_active(inode))
5470 : return 0;
5471 2919571 : if (ext4_should_journal_data(inode))
5472 : return 0;
5473 2918894 : if (ext4_has_inline_data(inode))
5474 : return 0;
5475 2918894 : if (IS_ENCRYPTED(inode)) {
5476 0 : if (!fscrypt_dio_supported(inode))
5477 : return 0;
5478 0 : return i_blocksize(inode);
5479 : }
5480 : return 1; /* use the iomap defaults */
5481 : }
5482 :
5483 17629248 : int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
5484 : struct kstat *stat, u32 request_mask, unsigned int query_flags)
5485 : {
5486 17629248 : struct inode *inode = d_inode(path->dentry);
5487 17629248 : struct ext4_inode *raw_inode;
5488 17629248 : struct ext4_inode_info *ei = EXT4_I(inode);
5489 17629248 : unsigned int flags;
5490 :
5491 17629248 : if ((request_mask & STATX_BTIME) &&
5492 274 : EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
5493 274 : stat->result_mask |= STATX_BTIME;
5494 274 : stat->btime.tv_sec = ei->i_crtime.tv_sec;
5495 274 : stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
5496 : }
5497 :
5498 : /*
5499 : * Return the DIO alignment restrictions if requested. We only return
5500 : * this information when requested, since on encrypted files it might
5501 : * take a fair bit of work to get if the file wasn't opened recently.
5502 : */
5503 17629248 : if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
5504 0 : u32 dio_align = ext4_dio_alignment(inode);
5505 :
5506 0 : stat->result_mask |= STATX_DIOALIGN;
5507 0 : if (dio_align == 1) {
5508 0 : struct block_device *bdev = inode->i_sb->s_bdev;
5509 :
5510 : /* iomap defaults */
5511 0 : stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
5512 0 : stat->dio_offset_align = bdev_logical_block_size(bdev);
5513 : } else {
5514 0 : stat->dio_mem_align = dio_align;
5515 0 : stat->dio_offset_align = dio_align;
5516 : }
5517 : }
5518 :
5519 17629248 : flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
5520 17629248 : if (flags & EXT4_APPEND_FL)
5521 185 : stat->attributes |= STATX_ATTR_APPEND;
5522 17629248 : if (flags & EXT4_COMPR_FL)
5523 71 : stat->attributes |= STATX_ATTR_COMPRESSED;
5524 17629248 : if (flags & EXT4_ENCRYPT_FL)
5525 0 : stat->attributes |= STATX_ATTR_ENCRYPTED;
5526 17629248 : if (flags & EXT4_IMMUTABLE_FL)
5527 190 : stat->attributes |= STATX_ATTR_IMMUTABLE;
5528 17629248 : if (flags & EXT4_NODUMP_FL)
5529 66 : stat->attributes |= STATX_ATTR_NODUMP;
5530 17629248 : if (flags & EXT4_VERITY_FL)
5531 0 : stat->attributes |= STATX_ATTR_VERITY;
5532 :
5533 17629248 : stat->attributes_mask |= (STATX_ATTR_APPEND |
5534 : STATX_ATTR_COMPRESSED |
5535 : STATX_ATTR_ENCRYPTED |
5536 : STATX_ATTR_IMMUTABLE |
5537 : STATX_ATTR_NODUMP |
5538 : STATX_ATTR_VERITY);
5539 :
5540 17629248 : generic_fillattr(idmap, inode, stat);
5541 17639627 : return 0;
5542 : }
5543 :
5544 10450661 : int ext4_file_getattr(struct mnt_idmap *idmap,
5545 : const struct path *path, struct kstat *stat,
5546 : u32 request_mask, unsigned int query_flags)
5547 : {
5548 10450661 : struct inode *inode = d_inode(path->dentry);
5549 10450661 : u64 delalloc_blocks;
5550 :
5551 10450661 : ext4_getattr(idmap, path, stat, request_mask, query_flags);
5552 :
5553 : /*
5554 : * If there is inline data in the inode, the inode will normally not
5555 : * have data blocks allocated (it may have an external xattr block).
5556 : * Report at least one sector for such files, so tools like tar, rsync,
5557 : * others don't incorrectly think the file is completely sparse.
5558 : */
5559 10452363 : if (unlikely(ext4_has_inline_data(inode)))
5560 0 : stat->blocks += (stat->size + 511) >> 9;
5561 :
5562 : /*
5563 : * We can't update i_blocks if the block allocation is delayed
5564 : * otherwise in the case of system crash before the real block
5565 : * allocation is done, we will have i_blocks inconsistent with
5566 : * on-disk file blocks.
5567 : * We always keep i_blocks updated together with real
5568 : * allocation. But to not confuse with user, stat
5569 : * will return the blocks that include the delayed allocation
5570 : * blocks for this file.
5571 : */
5572 10452363 : delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
5573 : EXT4_I(inode)->i_reserved_data_blocks);
5574 10452363 : stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
5575 10452363 : return 0;
5576 : }
5577 :
5578 9388185 : static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
5579 : int pextents)
5580 : {
5581 9388185 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5582 4566 : return ext4_ind_trans_blocks(inode, lblocks);
5583 9383619 : return ext4_ext_index_trans_blocks(inode, pextents);
5584 : }
5585 :
5586 : /*
5587 : * Account for index blocks, block groups bitmaps and block group
5588 : * descriptor blocks if modify datablocks and index blocks
5589 : * worse case, the indexs blocks spread over different block groups
5590 : *
5591 : * If datablocks are discontiguous, they are possible to spread over
5592 : * different block groups too. If they are contiguous, with flexbg,
5593 : * they could still across block group boundary.
5594 : *
5595 : * Also account for superblock, inode, quota and xattr blocks
5596 : */
5597 9388819 : static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
5598 : int pextents)
5599 : {
5600 9388819 : ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5601 9388453 : int gdpblocks;
5602 9388453 : int idxblocks;
5603 9388453 : int ret;
5604 :
5605 : /*
5606 : * How many index blocks need to touch to map @lblocks logical blocks
5607 : * to @pextents physical extents?
5608 : */
5609 9388453 : idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
5610 :
5611 9388361 : ret = idxblocks;
5612 :
5613 : /*
5614 : * Now let's see how many group bitmaps and group descriptors need
5615 : * to account
5616 : */
5617 9388361 : groups = idxblocks + pextents;
5618 9388361 : gdpblocks = groups;
5619 9388361 : if (groups > ngroups)
5620 : groups = ngroups;
5621 9388361 : if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5622 617628 : gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5623 :
5624 : /* bitmaps and block group descriptor blocks */
5625 9388361 : ret += groups + gdpblocks;
5626 :
5627 : /* Blocks for super block, inode, quota and xattr blocks */
5628 9388361 : ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5629 :
5630 9388361 : return ret;
5631 : }
5632 :
5633 : /*
5634 : * Calculate the total number of credits to reserve to fit
5635 : * the modification of a single pages into a single transaction,
5636 : * which may include multiple chunks of block allocations.
5637 : *
5638 : * This could be called via ext4_write_begin()
5639 : *
5640 : * We need to consider the worse case, when
5641 : * one new block per extent.
5642 : */
5643 3883676 : int ext4_writepage_trans_blocks(struct inode *inode)
5644 : {
5645 3883676 : int bpp = ext4_journal_blocks_per_page(inode);
5646 3883459 : int ret;
5647 :
5648 3883459 : ret = ext4_meta_trans_blocks(inode, bpp, bpp);
5649 :
5650 : /* Account for data blocks for journalled mode */
5651 3883194 : if (ext4_should_journal_data(inode))
5652 200606 : ret += bpp;
5653 3882751 : return ret;
5654 : }
5655 :
5656 : /*
5657 : * Calculate the journal credits for a chunk of data modification.
5658 : *
5659 : * This is called from DIO, fallocate or whoever calling
5660 : * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5661 : *
5662 : * journal buffers for data blocks are not included here, as DIO
5663 : * and fallocate do no need to journal data buffers.
5664 : */
5665 1620680 : int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5666 : {
5667 4347147 : return ext4_meta_trans_blocks(inode, nrblocks, 1);
5668 : }
5669 :
5670 : /*
5671 : * The caller must have previously called ext4_reserve_inode_write().
5672 : * Give this, we know that the caller already has write access to iloc->bh.
5673 : */
5674 78531920 : int ext4_mark_iloc_dirty(handle_t *handle,
5675 : struct inode *inode, struct ext4_iloc *iloc)
5676 : {
5677 78531920 : int err = 0;
5678 :
5679 157063840 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
5680 0 : put_bh(iloc->bh);
5681 0 : return -EIO;
5682 : }
5683 78531920 : ext4_fc_track_inode(handle, inode);
5684 :
5685 : /* the do_update_inode consumes one bh->b_count */
5686 78471234 : get_bh(iloc->bh);
5687 :
5688 : /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5689 78584272 : err = ext4_do_update_inode(handle, inode, iloc);
5690 78628691 : put_bh(iloc->bh);
5691 78628691 : return err;
5692 : }
5693 :
5694 : /*
5695 : * On success, We end up with an outstanding reference count against
5696 : * iloc->bh. This _must_ be cleaned up later.
5697 : */
5698 :
5699 : int
5700 79403770 : ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5701 : struct ext4_iloc *iloc)
5702 : {
5703 79403770 : int err;
5704 :
5705 158807540 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5706 : return -EIO;
5707 :
5708 79403770 : err = ext4_get_inode_loc(inode, iloc);
5709 79453781 : if (!err) {
5710 79471575 : BUFFER_TRACE(iloc->bh, "get_write_access");
5711 79471575 : err = ext4_journal_get_write_access(handle, inode->i_sb,
5712 : iloc->bh, EXT4_JTR_NONE);
5713 79543087 : if (err) {
5714 0 : brelse(iloc->bh);
5715 0 : iloc->bh = NULL;
5716 : }
5717 : }
5718 79525293 : ext4_std_error(inode->i_sb, err);
5719 : return err;
5720 : }
5721 :
5722 20 : static int __ext4_expand_extra_isize(struct inode *inode,
5723 : unsigned int new_extra_isize,
5724 : struct ext4_iloc *iloc,
5725 : handle_t *handle, int *no_expand)
5726 : {
5727 20 : struct ext4_inode *raw_inode;
5728 20 : struct ext4_xattr_ibody_header *header;
5729 20 : unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
5730 20 : struct ext4_inode_info *ei = EXT4_I(inode);
5731 20 : int error;
5732 :
5733 : /* this was checked at iget time, but double check for good measure */
5734 20 : if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
5735 : (ei->i_extra_isize & 3)) {
5736 0 : EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
5737 : ei->i_extra_isize,
5738 : EXT4_INODE_SIZE(inode->i_sb));
5739 0 : return -EFSCORRUPTED;
5740 : }
5741 20 : if ((new_extra_isize < ei->i_extra_isize) ||
5742 20 : (new_extra_isize < 4) ||
5743 20 : (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
5744 : return -EINVAL; /* Should never happen */
5745 :
5746 20 : raw_inode = ext4_raw_inode(iloc);
5747 :
5748 20 : header = IHDR(inode, raw_inode);
5749 :
5750 : /* No extended attributes present */
5751 20 : if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5752 18 : header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5753 2 : memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
5754 : EXT4_I(inode)->i_extra_isize, 0,
5755 : new_extra_isize - EXT4_I(inode)->i_extra_isize);
5756 2 : EXT4_I(inode)->i_extra_isize = new_extra_isize;
5757 2 : return 0;
5758 : }
5759 :
5760 : /*
5761 : * We may need to allocate external xattr block so we need quotas
5762 : * initialized. Here we can be called with various locks held so we
5763 : * cannot affort to initialize quotas ourselves. So just bail.
5764 : */
5765 18 : if (dquot_initialize_needed(inode))
5766 : return -EAGAIN;
5767 :
5768 : /* try to expand with EAs present */
5769 18 : error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
5770 : raw_inode, handle);
5771 18 : if (error) {
5772 : /*
5773 : * Inode size expansion failed; don't try again
5774 : */
5775 1 : *no_expand = 1;
5776 : }
5777 :
5778 : return error;
5779 : }
5780 :
5781 : /*
5782 : * Expand an inode by new_extra_isize bytes.
5783 : * Returns 0 on success or negative error number on failure.
5784 : */
5785 32 : static int ext4_try_to_expand_extra_isize(struct inode *inode,
5786 : unsigned int new_extra_isize,
5787 : struct ext4_iloc iloc,
5788 : handle_t *handle)
5789 : {
5790 32 : int no_expand;
5791 32 : int error;
5792 :
5793 32 : if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
5794 : return -EOVERFLOW;
5795 :
5796 : /*
5797 : * In nojournal mode, we can immediately attempt to expand
5798 : * the inode. When journaled, we first need to obtain extra
5799 : * buffer credits since we may write into the EA block
5800 : * with this same handle. If journal_extend fails, then it will
5801 : * only result in a minor loss of functionality for that inode.
5802 : * If this is felt to be critical, then e2fsck should be run to
5803 : * force a large enough s_min_extra_isize.
5804 : */
5805 20 : if (ext4_journal_extend(handle,
5806 40 : EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
5807 : return -ENOSPC;
5808 :
5809 20 : if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
5810 : return -EBUSY;
5811 :
5812 20 : error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
5813 : handle, &no_expand);
5814 20 : ext4_write_unlock_xattr(inode, &no_expand);
5815 :
5816 20 : return error;
5817 : }
5818 :
5819 0 : int ext4_expand_extra_isize(struct inode *inode,
5820 : unsigned int new_extra_isize,
5821 : struct ext4_iloc *iloc)
5822 : {
5823 0 : handle_t *handle;
5824 0 : int no_expand;
5825 0 : int error, rc;
5826 :
5827 0 : if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5828 0 : brelse(iloc->bh);
5829 0 : return -EOVERFLOW;
5830 : }
5831 :
5832 0 : handle = ext4_journal_start(inode, EXT4_HT_INODE,
5833 : EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
5834 0 : if (IS_ERR(handle)) {
5835 0 : error = PTR_ERR(handle);
5836 0 : brelse(iloc->bh);
5837 0 : return error;
5838 : }
5839 :
5840 0 : ext4_write_lock_xattr(inode, &no_expand);
5841 :
5842 0 : BUFFER_TRACE(iloc->bh, "get_write_access");
5843 0 : error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
5844 : EXT4_JTR_NONE);
5845 0 : if (error) {
5846 0 : brelse(iloc->bh);
5847 0 : goto out_unlock;
5848 : }
5849 :
5850 0 : error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
5851 : handle, &no_expand);
5852 :
5853 0 : rc = ext4_mark_iloc_dirty(handle, inode, iloc);
5854 0 : if (!error)
5855 0 : error = rc;
5856 :
5857 0 : out_unlock:
5858 0 : ext4_write_unlock_xattr(inode, &no_expand);
5859 0 : ext4_journal_stop(handle);
5860 0 : return error;
5861 : }
5862 :
5863 : /*
5864 : * What we do here is to mark the in-core inode as clean with respect to inode
5865 : * dirtiness (it may still be data-dirty).
5866 : * This means that the in-core inode may be reaped by prune_icache
5867 : * without having to perform any I/O. This is a very good thing,
5868 : * because *any* task may call prune_icache - even ones which
5869 : * have a transaction open against a different journal.
5870 : *
5871 : * Is this cheating? Not really. Sure, we haven't written the
5872 : * inode out, but prune_icache isn't a user-visible syncing function.
5873 : * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5874 : * we start and wait on commits.
5875 : */
5876 72191448 : int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
5877 : const char *func, unsigned int line)
5878 : {
5879 72191448 : struct ext4_iloc iloc;
5880 72191448 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5881 72191448 : int err;
5882 :
5883 72191448 : might_sleep();
5884 72128732 : trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5885 72086805 : err = ext4_reserve_inode_write(handle, inode, &iloc);
5886 72207799 : if (err)
5887 1 : goto out;
5888 :
5889 72207798 : if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
5890 32 : ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
5891 : iloc, handle);
5892 :
5893 72207798 : err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5894 72292469 : out:
5895 72292469 : if (unlikely(err))
5896 1 : ext4_error_inode_err(inode, func, line, 0, err,
5897 : "mark_inode_dirty error");
5898 72292469 : return err;
5899 : }
5900 :
5901 : /*
5902 : * ext4_dirty_inode() is called from __mark_inode_dirty()
5903 : *
5904 : * We're really interested in the case where a file is being extended.
5905 : * i_size has been changed by generic_commit_write() and we thus need
5906 : * to include the updated inode in the current transaction.
5907 : *
5908 : * Also, dquot_alloc_block() will always dirty the inode when blocks
5909 : * are allocated to the file.
5910 : *
5911 : * If the inode is marked synchronous, we don't honour that here - doing
5912 : * so would cause a commit on atime updates, which we don't bother doing.
5913 : * We handle synchronous inodes at the highest possible level.
5914 : */
5915 38541735 : void ext4_dirty_inode(struct inode *inode, int flags)
5916 : {
5917 38541735 : handle_t *handle;
5918 :
5919 38541735 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
5920 38476864 : if (IS_ERR(handle))
5921 : return;
5922 38476824 : ext4_mark_inode_dirty(handle, inode);
5923 38570739 : ext4_journal_stop(handle);
5924 : }
5925 :
5926 24 : int ext4_change_inode_journal_flag(struct inode *inode, int val)
5927 : {
5928 24 : journal_t *journal;
5929 24 : handle_t *handle;
5930 24 : int err;
5931 24 : int alloc_ctx;
5932 :
5933 : /*
5934 : * We have to be very careful here: changing a data block's
5935 : * journaling status dynamically is dangerous. If we write a
5936 : * data block to the journal, change the status and then delete
5937 : * that block, we risk forgetting to revoke the old log record
5938 : * from the journal and so a subsequent replay can corrupt data.
5939 : * So, first we make sure that the journal is empty and that
5940 : * nobody is changing anything.
5941 : */
5942 :
5943 24 : journal = EXT4_JOURNAL(inode);
5944 24 : if (!journal)
5945 : return 0;
5946 24 : if (is_journal_aborted(journal))
5947 : return -EROFS;
5948 :
5949 : /* Wait for all existing dio workers */
5950 24 : inode_dio_wait(inode);
5951 :
5952 : /*
5953 : * Before flushing the journal and switching inode's aops, we have
5954 : * to flush all dirty data the inode has. There can be outstanding
5955 : * delayed allocations, there can be unwritten extents created by
5956 : * fallocate or buffered writes in dioread_nolock mode covered by
5957 : * dirty data which can be converted only after flushing the dirty
5958 : * data (and journalled aops don't know how to handle these cases).
5959 : */
5960 24 : if (val) {
5961 12 : filemap_invalidate_lock(inode->i_mapping);
5962 12 : err = filemap_write_and_wait(inode->i_mapping);
5963 12 : if (err < 0) {
5964 0 : filemap_invalidate_unlock(inode->i_mapping);
5965 0 : return err;
5966 : }
5967 : }
5968 :
5969 24 : alloc_ctx = ext4_writepages_down_write(inode->i_sb);
5970 24 : jbd2_journal_lock_updates(journal);
5971 :
5972 : /*
5973 : * OK, there are no updates running now, and all cached data is
5974 : * synced to disk. We are now in a completely consistent state
5975 : * which doesn't have anything in the journal, and we know that
5976 : * no filesystem updates are running, so it is safe to modify
5977 : * the inode's in-core data-journaling state flag now.
5978 : */
5979 :
5980 24 : if (val)
5981 12 : ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5982 : else {
5983 12 : err = jbd2_journal_flush(journal, 0);
5984 12 : if (err < 0) {
5985 0 : jbd2_journal_unlock_updates(journal);
5986 0 : ext4_writepages_up_write(inode->i_sb, alloc_ctx);
5987 0 : return err;
5988 : }
5989 12 : ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5990 : }
5991 24 : ext4_set_aops(inode);
5992 :
5993 24 : jbd2_journal_unlock_updates(journal);
5994 24 : ext4_writepages_up_write(inode->i_sb, alloc_ctx);
5995 :
5996 24 : if (val)
5997 12 : filemap_invalidate_unlock(inode->i_mapping);
5998 :
5999 : /* Finally we can mark the inode as dirty. */
6000 :
6001 24 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
6002 24 : if (IS_ERR(handle))
6003 0 : return PTR_ERR(handle);
6004 :
6005 24 : ext4_fc_mark_ineligible(inode->i_sb,
6006 : EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
6007 24 : err = ext4_mark_inode_dirty(handle, inode);
6008 24 : ext4_handle_sync(handle);
6009 24 : ext4_journal_stop(handle);
6010 24 : ext4_std_error(inode->i_sb, err);
6011 :
6012 : return err;
6013 : }
6014 :
6015 97 : static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
6016 : struct buffer_head *bh)
6017 : {
6018 97 : return !buffer_mapped(bh);
6019 : }
6020 :
6021 8496389 : vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
6022 : {
6023 8496389 : struct vm_area_struct *vma = vmf->vma;
6024 8496389 : struct folio *folio = page_folio(vmf->page);
6025 8481056 : loff_t size;
6026 8481056 : unsigned long len;
6027 8481056 : int err;
6028 8481056 : vm_fault_t ret;
6029 8481056 : struct file *file = vma->vm_file;
6030 8481056 : struct inode *inode = file_inode(file);
6031 8481056 : struct address_space *mapping = inode->i_mapping;
6032 8481056 : handle_t *handle;
6033 8481056 : get_block_t *get_block;
6034 8481056 : int retries = 0;
6035 :
6036 8481056 : if (unlikely(IS_IMMUTABLE(inode)))
6037 : return VM_FAULT_SIGBUS;
6038 :
6039 8481056 : sb_start_pagefault(inode->i_sb);
6040 8477512 : file_update_time(vma->vm_file);
6041 :
6042 8477457 : filemap_invalidate_lock_shared(mapping);
6043 :
6044 8483129 : err = ext4_convert_inline_data(inode);
6045 8458723 : if (err)
6046 0 : goto out_ret;
6047 :
6048 : /*
6049 : * On data journalling we skip straight to the transaction handle:
6050 : * there's no delalloc; page truncated will be checked later; the
6051 : * early return w/ all buffers mapped (calculates size/len) can't
6052 : * be used; and there's no dioread_nolock, so only ext4_get_block.
6053 : */
6054 8464935 : if (ext4_should_journal_data(inode))
6055 0 : goto retry_alloc;
6056 :
6057 : /* Delalloc case is easy... */
6058 16936182 : if (test_opt(inode->i_sb, DELALLOC) &&
6059 8470768 : !ext4_nonda_switch(inode->i_sb)) {
6060 8473857 : do {
6061 8473857 : err = block_page_mkwrite(vma, vmf,
6062 : ext4_da_get_block_prep);
6063 8471482 : } while (err == -ENOSPC &&
6064 5294 : ext4_should_retry_alloc(inode->i_sb, &retries));
6065 8463723 : goto out_ret;
6066 : }
6067 :
6068 114 : folio_lock(folio);
6069 952 : size = i_size_read(inode);
6070 : /* Page got truncated from under us? */
6071 952 : if (folio->mapping != mapping || folio_pos(folio) > size) {
6072 0 : folio_unlock(folio);
6073 0 : ret = VM_FAULT_NOPAGE;
6074 0 : goto out;
6075 : }
6076 :
6077 952 : len = folio_size(folio);
6078 952 : if (folio_pos(folio) + len > size)
6079 0 : len = size - folio_pos(folio);
6080 : /*
6081 : * Return if we have all the buffers mapped. This avoids the need to do
6082 : * journal_start/journal_stop which can block and take a long time
6083 : *
6084 : * This cannot be done for data journalling, as we have to add the
6085 : * inode to the transaction's list to writeprotect pages on commit.
6086 : */
6087 952 : if (folio_buffers(folio)) {
6088 97 : if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
6089 : 0, len, NULL,
6090 : ext4_bh_unmapped)) {
6091 : /* Wait so that we don't change page under IO */
6092 97 : folio_wait_stable(folio);
6093 97 : ret = VM_FAULT_LOCKED;
6094 97 : goto out;
6095 : }
6096 : }
6097 855 : folio_unlock(folio);
6098 : /* OK, we need to fill the hole... */
6099 855 : if (ext4_should_dioread_nolock(inode))
6100 : get_block = ext4_get_block_unwritten;
6101 : else
6102 108 : get_block = ext4_get_block;
6103 854 : retry_alloc:
6104 947 : handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
6105 : ext4_writepage_trans_blocks(inode));
6106 947 : if (IS_ERR(handle)) {
6107 0 : ret = VM_FAULT_SIGBUS;
6108 0 : goto out;
6109 : }
6110 : /*
6111 : * Data journalling can't use block_page_mkwrite() because it
6112 : * will set_buffer_dirty() before do_journal_get_write_access()
6113 : * thus might hit warning messages for dirty metadata buffers.
6114 : */
6115 947 : if (!ext4_should_journal_data(inode)) {
6116 948 : err = block_page_mkwrite(vma, vmf, get_block);
6117 : } else {
6118 0 : folio_lock(folio);
6119 0 : size = i_size_read(inode);
6120 : /* Page got truncated from under us? */
6121 0 : if (folio->mapping != mapping || folio_pos(folio) > size) {
6122 0 : ret = VM_FAULT_NOPAGE;
6123 0 : goto out_error;
6124 : }
6125 :
6126 0 : len = folio_size(folio);
6127 0 : if (folio_pos(folio) + len > size)
6128 0 : len = size - folio_pos(folio);
6129 :
6130 0 : err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
6131 0 : if (!err) {
6132 0 : ret = VM_FAULT_SIGBUS;
6133 0 : if (ext4_journal_folio_buffers(handle, folio, len))
6134 0 : goto out_error;
6135 : } else {
6136 0 : folio_unlock(folio);
6137 : }
6138 : }
6139 948 : ext4_journal_stop(handle);
6140 947 : if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
6141 93 : goto retry_alloc;
6142 854 : out_ret:
6143 8458365 : ret = block_page_mkwrite_return(err);
6144 8458462 : out:
6145 8458462 : filemap_invalidate_unlock_shared(mapping);
6146 8481550 : sb_end_pagefault(inode->i_sb);
6147 8481550 : return ret;
6148 0 : out_error:
6149 0 : folio_unlock(folio);
6150 0 : ext4_journal_stop(handle);
6151 0 : goto out;
6152 : }
|