Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * linux/fs/ext4/inode.c
4 : *
5 : * Copyright (C) 1992, 1993, 1994, 1995
6 : * Remy Card (card@masi.ibp.fr)
7 : * Laboratoire MASI - Institut Blaise Pascal
8 : * Universite Pierre et Marie Curie (Paris VI)
9 : *
10 : * from
11 : *
12 : * linux/fs/minix/inode.c
13 : *
14 : * Copyright (C) 1991, 1992 Linus Torvalds
15 : *
16 : * 64-bit file support on 64-bit platforms by Jakub Jelinek
17 : * (jj@sunsite.ms.mff.cuni.cz)
18 : *
19 : * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
20 : */
21 :
22 : #include <linux/fs.h>
23 : #include <linux/mount.h>
24 : #include <linux/time.h>
25 : #include <linux/highuid.h>
26 : #include <linux/pagemap.h>
27 : #include <linux/dax.h>
28 : #include <linux/quotaops.h>
29 : #include <linux/string.h>
30 : #include <linux/buffer_head.h>
31 : #include <linux/writeback.h>
32 : #include <linux/pagevec.h>
33 : #include <linux/mpage.h>
34 : #include <linux/namei.h>
35 : #include <linux/uio.h>
36 : #include <linux/bio.h>
37 : #include <linux/workqueue.h>
38 : #include <linux/kernel.h>
39 : #include <linux/printk.h>
40 : #include <linux/slab.h>
41 : #include <linux/bitops.h>
42 : #include <linux/iomap.h>
43 : #include <linux/iversion.h>
44 :
45 : #include "ext4_jbd2.h"
46 : #include "xattr.h"
47 : #include "acl.h"
48 : #include "truncate.h"
49 :
50 : #include <trace/events/ext4.h>
51 :
52 73355010 : static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
53 : struct ext4_inode_info *ei)
54 : {
55 73355010 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
56 73355010 : __u32 csum;
57 73355010 : __u16 dummy_csum = 0;
58 73355010 : int offset = offsetof(struct ext4_inode, i_checksum_lo);
59 73355010 : unsigned int csum_size = sizeof(dummy_csum);
60 :
61 73355010 : csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
62 73495093 : csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
63 73410339 : offset += csum_size;
64 73410339 : csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
65 : EXT4_GOOD_OLD_INODE_SIZE - offset);
66 :
67 73396161 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
68 73386847 : offset = offsetof(struct ext4_inode, i_checksum_hi);
69 73386847 : csum = ext4_chksum(sbi, csum, (__u8 *)raw +
70 : EXT4_GOOD_OLD_INODE_SIZE,
71 : offset - EXT4_GOOD_OLD_INODE_SIZE);
72 73436333 : if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
73 73447789 : csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
74 : csum_size);
75 73447789 : offset += csum_size;
76 : }
77 73393495 : csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
78 73393495 : EXT4_INODE_SIZE(inode->i_sb) - offset);
79 : }
80 :
81 73527675 : return csum;
82 : }
83 :
84 215779 : static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
85 : struct ext4_inode_info *ei)
86 : {
87 215779 : __u32 provided, calculated;
88 :
89 215779 : if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
90 215780 : cpu_to_le32(EXT4_OS_LINUX) ||
91 215779 : !ext4_has_metadata_csum(inode->i_sb))
92 470 : return 1;
93 :
94 215310 : provided = le16_to_cpu(raw->i_checksum_lo);
95 215310 : calculated = ext4_inode_csum(inode, raw, ei);
96 215310 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
97 215306 : EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
98 215304 : provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
99 : else
100 6 : calculated &= 0xFFFF;
101 :
102 215310 : return provided == calculated;
103 : }
104 :
105 73223540 : void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
106 : struct ext4_inode_info *ei)
107 : {
108 73223540 : __u32 csum;
109 :
110 73223540 : if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
111 73135515 : cpu_to_le32(EXT4_OS_LINUX) ||
112 73242808 : !ext4_has_metadata_csum(inode->i_sb))
113 16001 : return;
114 :
115 73100246 : csum = ext4_inode_csum(inode, raw, ei);
116 73297256 : raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
117 73297256 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
118 73299303 : EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
119 73299303 : raw->i_checksum_hi = cpu_to_le16(csum >> 16);
120 : }
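
/*
 * A minimal userspace sketch of the checksum pattern used above:
 * checksum a struct as if its embedded checksum field were zero, by
 * feeding the bytes before the field, a zeroed dummy of the same
 * size, and then the bytes after it. struct record, toy_csum() and
 * all other names here are illustrative assumptions; ext4 itself
 * uses crc32c via ext4_chksum() over struct ext4_inode.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct record {
	uint32_t id;
	uint16_t csum;	/* covered as zero during computation */
	uint16_t flags;
};

static uint32_t toy_csum(uint32_t seed, const uint8_t *p, size_t len)
{
	while (len--)			/* toy rolling hash, not crc32c */
		seed = (seed << 5) + seed + *p++;
	return seed;
}

static uint32_t record_csum(const struct record *r)
{
	const uint8_t *raw = (const uint8_t *)r;
	size_t offset = offsetof(struct record, csum);
	uint16_t dummy_csum = 0;
	uint32_t csum;

	csum = toy_csum(0, raw, offset);			/* bytes before the field */
	csum = toy_csum(csum, (const uint8_t *)&dummy_csum,
			sizeof(dummy_csum));			/* zeroed stand-in */
	offset += sizeof(dummy_csum);
	csum = toy_csum(csum, raw + offset, sizeof(*r) - offset); /* bytes after */
	return csum;
}

int main(void)
{
	struct record r = { .id = 42, .flags = 7 };

	r.csum = (uint16_t)record_csum(&r);	/* store the truncated csum */
	printf("verifies: %d\n", (uint16_t)record_csum(&r) == r.csum);
	return 0;
}
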
121 :
122 1957127 : static inline int ext4_begin_ordered_truncate(struct inode *inode,
123 : loff_t new_size)
124 : {
125 1957127 : trace_ext4_begin_ordered_truncate(inode, new_size);
126 : /*
127 : * If jinode is zero, then we never opened the file for
128 : * writing, so there's no need to call
129 : * jbd2_journal_begin_ordered_truncate() since there are no
130 : * outstanding writes we need to flush.
131 : */
132 1957060 : if (!EXT4_I(inode)->jinode)
133 : return 0;
134 1809526 : return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
135 1809526 : EXT4_I(inode)->jinode,
136 : new_size);
137 : }
138 :
139 : static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
140 : int pextents);
141 :
142 : /*
143 : * Test whether an inode is a fast symlink.
144 : * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
145 : */
146 1977069 : int ext4_inode_is_fast_symlink(struct inode *inode)
147 : {
148 1977069 : if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
149 1977059 : int ea_blocks = EXT4_I(inode)->i_file_acl ?
150 14146 : EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
151 :
152 1977059 : if (ext4_has_inline_data(inode))
153 : return 0;
154 :
155 1977059 : return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
156 : }
157 10 : return S_ISLNK(inode->i_mode) && inode->i_size &&
158 : (inode->i_size < EXT4_N_BLOCKS * 4);
159 : }
160 :
161 : /*
162 : * Called at the last iput() if i_nlink is zero.
163 : */
164 3160768 : void ext4_evict_inode(struct inode *inode)
165 : {
166 3160768 : handle_t *handle;
167 3160768 : int err;
168 : /*
169 : * Credits for final inode cleanup and freeing:
170 : * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
171 : * (xattr block freeing), bitmap, group descriptor (inode freeing)
172 : */
173 3160768 : int extra_credits = 6;
174 3160768 : struct ext4_xattr_inode_array *ea_inode_array = NULL;
175 3160768 : bool freeze_protected = false;
176 :
177 3160768 : trace_ext4_evict_inode(inode);
178 :
179 3160591 : if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
180 10 : ext4_evict_ea_inode(inode);
181 3160591 : if (inode->i_nlink) {
182 1142890 : truncate_inode_pages_final(&inode->i_data);
183 :
184 1142890 : goto no_delete;
185 : }
186 :
187 2017701 : if (is_bad_inode(inode))
188 1035 : goto no_delete;
189 2016643 : dquot_initialize(inode);
190 :
191 2016738 : if (ext4_should_order_data(inode))
192 1757800 : ext4_begin_ordered_truncate(inode, 0);
193 2016431 : truncate_inode_pages_final(&inode->i_data);
194 :
195 : /*
196 : * For inodes with journalled data, transaction commit could have
197 : * dirtied the inode. And for inodes with dioread_nolock, the unwritten
198 : * extent conversion worker could merge extents and also have dirtied
199 : * the inode. The flush worker ignores it because of the I_FREEING flag,
200 : * but we still need to remove the inode from the writeback lists.
201 : */
202 2015906 : if (!list_empty_careful(&inode->i_io_list))
203 189 : inode_io_list_del(inode);
204 :
205 : /*
206 : * Protect us against freezing - iput() caller didn't have to have any
207 : * protection against it. When we are in a running transaction though,
208 : * we are already protected against freezing and we cannot grab further
209 : * protection due to lock ordering constraints.
210 : */
211 2015294 : if (!ext4_journal_current_handle()) {
212 2013593 : sb_start_intwrite(inode->i_sb);
213 2013593 : freeze_protected = true;
214 : }
215 :
216 2015517 : if (!IS_NOQUOTA(inode))
217 2032797 : extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
218 :
219 : /*
220 : * Block bitmap, group descriptor, and inode are accounted in both
221 : * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
222 : */
223 2015517 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
224 : ext4_blocks_for_truncate(inode) + extra_credits - 3);
225 2012539 : if (IS_ERR(handle)) {
226 50595 : ext4_std_error(inode->i_sb, PTR_ERR(handle));
227 : /*
228 : * If we're going to skip the normal cleanup, we still need to
229 : * make sure that the in-core orphan linked list is properly
230 : * cleaned up.
231 : */
232 50595 : ext4_orphan_del(NULL, inode);
233 50595 : if (freeze_protected)
234 50595 : sb_end_intwrite(inode->i_sb);
235 50595 : goto no_delete;
236 : }
237 :
238 1961944 : if (IS_SYNC(inode))
239 3 : ext4_handle_sync(handle);
240 :
241 : /*
242 : * Set inode->i_size to 0 before calling ext4_truncate(). We need
243 : * special handling of symlinks here because i_size is used to
244 : * determine whether ext4_inode_info->i_data contains symlink data or
245 : * block mappings. Setting i_size to 0 will remove its fast symlink
246 : * status. Erase i_data so that it becomes a valid empty block map.
247 : */
248 1961944 : if (ext4_inode_is_fast_symlink(inode))
249 16121 : memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
250 1958975 : inode->i_size = 0;
251 1958975 : err = ext4_mark_inode_dirty(handle, inode);
252 1963957 : if (err) {
253 0 : ext4_warning(inode->i_sb,
254 : "couldn't mark inode dirty (err %d)", err);
255 0 : goto stop_handle;
256 : }
257 1963957 : if (inode->i_blocks) {
258 282052 : err = ext4_truncate(inode);
259 282053 : if (err) {
260 0 : ext4_error_err(inode->i_sb, -err,
261 : "couldn't truncate inode %lu (err %d)",
262 : inode->i_ino, err);
263 0 : goto stop_handle;
264 : }
265 : }
266 :
267 : /* Remove xattr references. */
268 1963958 : err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
269 : extra_credits);
270 1954458 : if (err) {
271 0 : ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
272 0 : stop_handle:
273 0 : ext4_journal_stop(handle);
274 0 : ext4_orphan_del(NULL, inode);
275 0 : if (freeze_protected)
276 0 : sb_end_intwrite(inode->i_sb);
277 0 : ext4_xattr_inode_array_free(ea_inode_array);
278 0 : goto no_delete;
279 : }
280 :
281 : /*
282 : * Kill off the orphan record which ext4_truncate created.
283 : * AKPM: I think this can be inside the above `if'.
284 : * Note that ext4_orphan_del() has to be able to cope with the
285 : * deletion of a non-existent orphan - this is because we don't
286 : * know if ext4_truncate() actually created an orphan record.
287 : * (Well, we could do this if we need to, but heck - it works)
288 : */
289 1954458 : ext4_orphan_del(handle, inode);
290 1966482 : EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();
291 :
292 : /*
293 : * One subtle ordering requirement: if anything has gone wrong
294 : * (transaction abort, IO errors, whatever), then we can still
295 : * do these next steps (the fs will already have been marked as
296 : * having errors), but we can't free the inode if the mark_dirty
297 : * fails.
298 : */
299 1966478 : if (ext4_mark_inode_dirty(handle, inode))
300 : /* If that failed, just do the required in-core inode clear. */
301 0 : ext4_clear_inode(inode);
302 : else
303 1966457 : ext4_free_inode(handle, inode);
304 1965972 : ext4_journal_stop(handle);
305 1964870 : if (freeze_protected)
306 1963295 : sb_end_intwrite(inode->i_sb);
307 1965546 : ext4_xattr_inode_array_free(ea_inode_array);
308 1964994 : return;
309 1194520 : no_delete:
310 : /*
311 : * Catch the case where somewhere else accidentally dirtied the
312 : * evicting inode, which could cause inode use-after-free issues later.
313 : */
314 2389040 : WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
315 :
316 1194520 : if (!list_empty(&EXT4_I(inode)->i_fc_list))
317 0 : ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
318 1194520 : ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
319 : }
320 :
321 : #ifdef CONFIG_QUOTA
322 26878405 : qsize_t *ext4_get_reserved_space(struct inode *inode)
323 : {
324 26878405 : return &EXT4_I(inode)->i_reserved_quota;
325 : }
326 : #endif
327 :
328 : /*
329 : * Called with i_data_sem down, which is important since we can call
330 : * ext4_discard_preallocations() from here.
331 : */
332 928252 : void ext4_da_update_reserve_space(struct inode *inode,
333 : int used, int quota_claim)
334 : {
335 928252 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
336 928252 : struct ext4_inode_info *ei = EXT4_I(inode);
337 :
338 928252 : spin_lock(&ei->i_block_reservation_lock);
339 928258 : trace_ext4_da_update_reserve_space(inode, used, quota_claim);
340 928244 : if (unlikely(used > ei->i_reserved_data_blocks)) {
341 0 : ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
342 : "with only %d reserved data blocks",
343 : __func__, inode->i_ino, used,
344 : ei->i_reserved_data_blocks);
345 0 : WARN_ON(1);
346 0 : used = ei->i_reserved_data_blocks;
347 : }
348 :
349 : /* Update per-inode reservations */
350 928244 : ei->i_reserved_data_blocks -= used;
351 928244 : percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
352 :
353 928258 : spin_unlock(&ei->i_block_reservation_lock);
354 :
355 : /* Update quota subsystem for data blocks */
356 928260 : if (quota_claim)
357 890312 : dquot_claim_block(inode, EXT4_C2B(sbi, used));
358 : else {
359 : /*
360 : * We did fallocate with an offset that is already delayed
361 : * allocated. So on delayed allocated writeback we should
362 : * not re-claim the quota for fallocated blocks.
363 : */
364 37948 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
365 : }
366 :
367 : /*
368 : * If we have done all the pending block allocations and if
369 : * there aren't any writers on the inode, we can discard the
370 : * inode's preallocations.
371 : */
372 928264 : if ((ei->i_reserved_data_blocks == 0) &&
373 : !inode_is_open_for_write(inode))
374 359248 : ext4_discard_preallocations(inode, 0);
375 928264 : }
376 :
377 30471762 : static int __check_block_validity(struct inode *inode, const char *func,
378 : unsigned int line,
379 : struct ext4_map_blocks *map)
380 : {
381 30471762 : if (ext4_has_feature_journal(inode->i_sb) &&
382 27626966 : (inode->i_ino ==
383 27626966 : le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
384 : return 0;
385 26773061 : if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
386 0 : ext4_error_inode(inode, func, line, map->m_pblk,
387 : "lblock %lu mapped to illegal pblock %llu "
388 : "(length %d)", (unsigned long) map->m_lblk,
389 : map->m_pblk, map->m_len);
390 0 : return -EFSCORRUPTED;
391 : }
392 : return 0;
393 : }
394 :
395 0 : int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
396 : ext4_lblk_t len)
397 : {
398 0 : int ret;
399 :
400 0 : if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
401 : return fscrypt_zeroout_range(inode, lblk, pblk, len);
402 :
403 0 : ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
404 0 : if (ret > 0)
405 : ret = 0;
406 :
407 : return ret;
408 : }
409 :
410 : #define check_block_validity(inode, map) \
411 : __check_block_validity((inode), __func__, __LINE__, (map))
412 :
413 : #ifdef ES_AGGRESSIVE_TEST
414 : static void ext4_map_blocks_es_recheck(handle_t *handle,
415 : struct inode *inode,
416 : struct ext4_map_blocks *es_map,
417 : struct ext4_map_blocks *map,
418 : int flags)
419 : {
420 : int retval;
421 :
422 : map->m_flags = 0;
423 : /*
424 : * There is a race window in which the results differ, e.g.
425 : * xfstests #223 when dioread_nolock is enabled. The reason is
426 : * that we look up a block mapping in the extent status tree
427 : * without taking i_data_sem, so in the meantime the unwritten
428 : * extent could have been converted.
429 : */
430 : down_read(&EXT4_I(inode)->i_data_sem);
431 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
432 : retval = ext4_ext_map_blocks(handle, inode, map, 0);
433 : } else {
434 : retval = ext4_ind_map_blocks(handle, inode, map, 0);
435 : }
436 : up_read((&EXT4_I(inode)->i_data_sem));
437 :
438 : /*
439 : * We don't check m_len because the extent will be collapsed in the
440 : * status tree, so the lengths might not be equal.
441 : */
442 : if (es_map->m_lblk != map->m_lblk ||
443 : es_map->m_flags != map->m_flags ||
444 : es_map->m_pblk != map->m_pblk) {
445 : printk("ES cache assertion failed for inode: %lu "
446 : "es_cached ex [%d/%d/%llu/%x] != "
447 : "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
448 : inode->i_ino, es_map->m_lblk, es_map->m_len,
449 : es_map->m_pblk, es_map->m_flags, map->m_lblk,
450 : map->m_len, map->m_pblk, map->m_flags,
451 : retval, flags);
452 : }
453 : }
454 : #endif /* ES_AGGRESSIVE_TEST */
455 :
456 : /*
457 : * The ext4_map_blocks() function tries to look up the requested blocks
458 : * and returns immediately if the blocks are already mapped.
459 : *
460 : * Otherwise it takes the write lock of i_data_sem, allocates blocks,
461 : * stores the allocated blocks in the result buffer head and marks it
462 : * mapped.
463 : *
464 : * If the file is extent based, it calls ext4_ext_map_blocks();
465 : * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped
466 : * files.
467 : *
468 : * On success, it returns the number of blocks mapped or allocated. If
469 : * create == 0 and the blocks are pre-allocated and unwritten, @map is
470 : * marked as unwritten. If create == 1, it marks @map as mapped.
471 : *
472 : * It returns 0 if a plain lookup failed (blocks have not been allocated);
473 : * in that case @map is returned as unmapped but map->m_len is still filled
474 : * to indicate the length of a hole starting at map->m_lblk.
475 : *
476 : * It returns a negative error code in case of allocation failure.
477 : */
478 70686789 : int ext4_map_blocks(handle_t *handle, struct inode *inode,
479 : struct ext4_map_blocks *map, int flags)
480 : {
481 70686789 : struct extent_status es;
482 70686789 : int retval;
483 70686789 : int ret = 0;
484 : #ifdef ES_AGGRESSIVE_TEST
485 : struct ext4_map_blocks orig_map;
486 :
487 : memcpy(&orig_map, map, sizeof(*map));
488 : #endif
489 :
490 70686789 : map->m_flags = 0;
491 70686789 : ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
492 : flags, map->m_len, (unsigned long) map->m_lblk);
493 :
494 : /*
495 : * ext4_map_blocks returns an int, and m_len is an unsigned int
496 : */
497 70686789 : if (unlikely(map->m_len > INT_MAX))
498 83669 : map->m_len = INT_MAX;
499 :
500 : /* We can only handle block numbers less than EXT_MAX_BLOCKS */
501 70686789 : if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
502 : return -EFSCORRUPTED;
503 :
504 : /* Look up the extent status tree first */
505 141405760 : if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
506 70682426 : ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
507 67905769 : if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
508 35061223 : map->m_pblk = ext4_es_pblock(&es) +
509 35061223 : map->m_lblk - es.es_lblk;
510 70122446 : map->m_flags |= ext4_es_is_written(&es) ?
511 35061223 : EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
512 35061223 : retval = es.es_len - (map->m_lblk - es.es_lblk);
513 35061223 : if (retval > map->m_len)
514 : retval = map->m_len;
515 35061223 : map->m_len = retval;
516 32844546 : } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
517 32844546 : map->m_pblk = 0;
518 32844546 : retval = es.es_len - (map->m_lblk - es.es_lblk);
519 32844546 : if (retval > map->m_len)
520 : retval = map->m_len;
521 32844546 : map->m_len = retval;
522 32844546 : retval = 0;
523 : } else {
524 0 : BUG();
525 : }
526 :
527 67905769 : if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
528 : return retval;
529 : #ifdef ES_AGGRESSIVE_TEST
530 : ext4_map_blocks_es_recheck(handle, inode, map,
531 : &orig_map, flags);
532 : #endif
533 67898573 : goto found;
534 : }
535 : /*
536 : * In cached no-wait lookup mode, there is nothing more we can do
537 : * if we cannot find the extent in the cache.
538 : */
539 2817565 : if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
540 : return 0;
541 :
542 : /*
543 : * Try to see if we can get the block without requesting a new
544 : * file system block.
545 : */
546 2817565 : down_read(&EXT4_I(inode)->i_data_sem);
547 2817438 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
548 2797200 : retval = ext4_ext_map_blocks(handle, inode, map, 0);
549 : } else {
550 20238 : retval = ext4_ind_map_blocks(handle, inode, map, 0);
551 : }
552 2817249 : if (retval > 0) {
553 1004394 : unsigned int status;
554 :
555 1004394 : if (unlikely(retval != map->m_len)) {
556 0 : ext4_warning(inode->i_sb,
557 : "ES len assertion failed for inode "
558 : "%lu: retval %d != map->m_len %d",
559 : inode->i_ino, retval, map->m_len);
560 0 : WARN_ON(1);
561 : }
562 :
563 1004394 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
564 1004394 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
565 1004394 : if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
566 1307336 : !(status & EXTENT_STATUS_WRITTEN) &&
567 302942 : ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
568 302942 : map->m_lblk + map->m_len - 1))
569 0 : status |= EXTENT_STATUS_DELAYED;
570 1004394 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
571 : map->m_pblk, status);
572 : }
573 2817250 : up_read((&EXT4_I(inode)->i_data_sem));
574 :
575 70713236 : found:
576 70713236 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
577 24755048 : ret = check_block_validity(inode, map);
578 24753759 : if (ret != 0)
579 : return ret;
580 : }
581 :
582 : /* If it is only a block(s) look up */
583 70711947 : if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
584 : return retval;
585 :
586 : /*
587 : * Return if the blocks have already been allocated.
588 : *
589 : * Note that if blocks have been preallocated,
590 : * ext4_ext_get_block() returns with create = 0
591 : * and the buffer head unmapped.
592 : */
593 7309606 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
594 : /*
595 : * If we need to convert extent to unwritten
596 : * we continue and do the actual work in
597 : * ext4_ext_map_blocks()
598 : */
599 868289 : if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
600 : return retval;
601 :
602 : /*
603 : * Here we clear m_flags because after allocating a new extent,
604 : * it will be set again.
605 : */
606 6563347 : map->m_flags &= ~EXT4_MAP_FLAGS;
607 :
608 : /*
609 : * Allocating new blocks and/or writing to an unwritten extent
610 : * can possibly result in updating i_data, so we take
611 : * the write lock of i_data_sem, and call get_block()
612 : * with create == 1 flag.
613 : */
614 6563347 : down_write(&EXT4_I(inode)->i_data_sem);
615 :
616 : /*
617 : * We need to re-check the extents flag here because migrate
618 : * could have changed the inode type in between
619 : */
620 6563054 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
621 6558032 : retval = ext4_ext_map_blocks(handle, inode, map, flags);
622 : } else {
623 5022 : retval = ext4_ind_map_blocks(handle, inode, map, flags);
624 :
625 5025 : if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
626 : /*
627 : * We allocated new blocks which will result in
628 : * i_data's format changing. Force the migrate
629 : * to fail by clearing migrate flags
630 : */
631 5015 : ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
632 : }
633 : }
634 :
635 6564162 : if (retval > 0) {
636 5840323 : unsigned int status;
637 :
638 5840323 : if (unlikely(retval != map->m_len)) {
639 0 : ext4_warning(inode->i_sb,
640 : "ES len assertion failed for inode "
641 : "%lu: retval %d != map->m_len %d",
642 : inode->i_ino, retval, map->m_len);
643 0 : WARN_ON(1);
644 : }
645 :
646 : /*
647 : * We have to zeroout blocks before inserting them into extent
648 : * status tree. Otherwise someone could look them up there and
649 : * use them before they are really zeroed. We also have to
650 : * unmap metadata before zeroing as otherwise writeback can
651 : * overwrite zeros with stale data from block device.
652 : */
653 5840323 : if (flags & EXT4_GET_BLOCKS_ZERO &&
654 0 : map->m_flags & EXT4_MAP_MAPPED &&
655 : map->m_flags & EXT4_MAP_NEW) {
656 0 : ret = ext4_issue_zeroout(inode, map->m_lblk,
657 : map->m_pblk, map->m_len);
658 0 : if (ret) {
659 0 : retval = ret;
660 0 : goto out_sem;
661 : }
662 : }
663 :
664 : /*
665 : * If the extent has been zeroed out, we don't need to update
666 : * extent status tree.
667 : */
668 7489220 : if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
669 1648798 : ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
670 1648823 : if (ext4_es_is_written(&es))
671 2 : goto out_sem;
672 : }
673 5840420 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
674 5840420 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
675 5840420 : if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
676 7554781 : !(status & EXTENT_STATUS_WRITTEN) &&
677 2626583 : ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
678 2626583 : map->m_lblk + map->m_len - 1))
679 76037 : status |= EXTENT_STATUS_DELAYED;
680 5840943 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
681 : map->m_pblk, status);
682 : }
683 :
684 723839 : out_sem:
685 6563898 : up_write((&EXT4_I(inode)->i_data_sem));
686 6563695 : if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
687 5717974 : ret = check_block_validity(inode, map);
688 5718234 : if (ret != 0)
689 : return ret;
690 :
691 : /*
692 : * Inodes with freshly allocated blocks where contents will be
693 : * visible after transaction commit must be on transaction's
694 : * ordered data list.
695 : */
696 5718234 : if (map->m_flags & EXT4_MAP_NEW &&
697 657473 : !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
698 1314942 : !(flags & EXT4_GET_BLOCKS_ZERO) &&
699 655435 : !ext4_is_quota_file(inode) &&
700 : ext4_should_order_data(inode)) {
701 512126 : loff_t start_byte =
702 256063 : (loff_t)map->m_lblk << inode->i_blkbits;
703 256063 : loff_t length = (loff_t)map->m_len << inode->i_blkbits;
704 :
705 256063 : if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
706 207 : ret = ext4_jbd2_inode_add_wait(handle, inode,
707 : start_byte, length);
708 : else
709 255856 : ret = ext4_jbd2_inode_add_write(handle, inode,
710 : start_byte, length);
711 256218 : if (ret)
712 : return ret;
713 : }
714 : }
715 6564028 : if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
716 : map->m_flags & EXT4_MAP_MAPPED))
717 5840162 : ext4_fc_track_range(handle, inode, map->m_lblk,
718 5840162 : map->m_lblk + map->m_len - 1);
719 : if (retval < 0)
720 : ext_debug(inode, "failed with err %d\n", retval);
721 : return retval;
722 : }
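
/*
 * A hedged caller sketch based only on the semantics documented
 * above: probe the mapping state of a single logical block without
 * allocating. probe_block() and its pr_debug() output are
 * illustrative assumptions, not ext4 code.
 */
static void probe_block(handle_t *handle, struct inode *inode,
			ext4_lblk_t lblk)
{
	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
	int ret = ext4_map_blocks(handle, inode, &map, 0 /* no create */);

	if (ret < 0)
		pr_debug("lookup failed: %d\n", ret);
	else if (ret == 0)	/* hole; m_len gives the hole length */
		pr_debug("hole of %u blocks at %u\n", map.m_len, map.m_lblk);
	else if (map.m_flags & EXT4_MAP_UNWRITTEN)
		pr_debug("unwritten, pblk %llu\n", (unsigned long long)map.m_pblk);
	else
		pr_debug("mapped, pblk %llu\n", (unsigned long long)map.m_pblk);
}
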
723 :
724 : /*
725 : * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
726 : * we have to be careful as someone else may be manipulating b_state as well.
727 : */
728 7417169 : static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
729 : {
730 7417169 : unsigned long old_state;
731 7417169 : unsigned long new_state;
732 :
733 7417169 : flags &= EXT4_MAP_FLAGS;
734 :
735 : /* Dummy buffer_head? Set non-atomically. */
736 7417169 : if (!bh->b_page) {
737 0 : bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
738 0 : return;
739 : }
740 : /*
741 : * Someone else may be modifying b_state. Be careful! This is ugly but
742 : * once we get rid of using bh as a container for mapping information
743 : * to pass to / from get_block functions, this can go away.
744 : */
745 7417169 : old_state = READ_ONCE(bh->b_state);
746 7417169 : do {
747 7417169 : new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
748 7417169 : } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
749 : }
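
/*
 * A minimal userspace sketch of the compare-and-exchange loop above:
 * merge a set of flag bits into a word that other threads may update
 * concurrently. C11 atomics stand in for the kernel's READ_ONCE() and
 * try_cmpxchg(); MAP_FLAGS and all names here are assumptions.
 */
#include <stdatomic.h>
#include <stdio.h>

#define MAP_FLAGS 0x0fUL	/* the bits this updater owns */

static void update_state(atomic_ulong *state, unsigned long flags)
{
	unsigned long old_state = atomic_load(state);
	unsigned long new_state;

	flags &= MAP_FLAGS;
	do {
		/* Recompute from the freshest value on every retry;
		 * compare_exchange_weak reloads old_state on failure. */
		new_state = (old_state & ~MAP_FLAGS) | flags;
	} while (!atomic_compare_exchange_weak(state, &old_state, new_state));
}

int main(void)
{
	atomic_ulong state = 0xf0UL;	/* bits outside MAP_FLAGS preserved */

	update_state(&state, 0x05UL);
	printf("state = %#lx\n", atomic_load(&state));	/* prints 0xf5 */
	return 0;
}
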
750 :
751 984215 : static int _ext4_get_block(struct inode *inode, sector_t iblock,
752 : struct buffer_head *bh, int flags)
753 : {
754 984215 : struct ext4_map_blocks map;
755 984215 : int ret = 0;
756 :
757 984215 : if (ext4_has_inline_data(inode))
758 : return -ERANGE;
759 :
760 984215 : map.m_lblk = iblock;
761 984215 : map.m_len = bh->b_size >> inode->i_blkbits;
762 :
763 984215 : ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
764 : flags);
765 985690 : if (ret > 0) {
766 701441 : map_bh(bh, inode->i_sb, map.m_pblk);
767 701913 : ext4_update_bh_state(bh, map.m_flags);
768 702049 : bh->b_size = inode->i_sb->s_blocksize * map.m_len;
769 702049 : ret = 0;
770 284249 : } else if (ret == 0) {
771 : /* hole case, need to fill in bh->b_size */
772 277520 : bh->b_size = inode->i_sb->s_blocksize * map.m_len;
773 : }
774 : return ret;
775 : }
776 :
777 134426 : int ext4_get_block(struct inode *inode, sector_t iblock,
778 : struct buffer_head *bh, int create)
779 : {
780 134426 : return _ext4_get_block(inode, iblock, bh,
781 : create ? EXT4_GET_BLOCKS_CREATE : 0);
782 : }
783 :
784 : /*
785 : * Get block function used when preparing for buffered write if we require
786 : * creating an unwritten extent if blocks haven't been allocated. The extent
787 : * will be converted to written after the IO is complete.
788 : */
789 91157 : int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
790 : struct buffer_head *bh_result, int create)
791 : {
792 91157 : ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
793 : inode->i_ino, create);
794 91157 : return _ext4_get_block(inode, iblock, bh_result,
795 : EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
796 : }
797 :
798 : /* Maximum number of blocks we map for direct IO at once. */
799 : #define DIO_MAX_BLOCKS 4096
800 :
801 : /*
802 : * `handle' can be NULL if create is zero
803 : */
804 18450538 : struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
805 : ext4_lblk_t block, int map_flags)
806 : {
807 18450538 : struct ext4_map_blocks map;
808 18450538 : struct buffer_head *bh;
809 18450538 : int create = map_flags & EXT4_GET_BLOCKS_CREATE;
810 18450538 : bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
811 18450538 : int err;
812 :
813 18450538 : ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
814 : || handle != NULL || create == 0);
815 18450538 : ASSERT(create == 0 || !nowait);
816 :
817 18450538 : map.m_lblk = block;
818 18450538 : map.m_len = 1;
819 18450538 : err = ext4_map_blocks(handle, inode, &map, map_flags);
820 :
821 18484010 : if (err == 0)
822 0 : return create ? ERR_PTR(-ENOSPC) : NULL;
823 18484010 : if (err < 0)
824 25707 : return ERR_PTR(err);
825 :
826 18458303 : if (nowait)
827 913 : return sb_find_get_block(inode->i_sb, map.m_pblk);
828 :
829 18457390 : bh = sb_getblk(inode->i_sb, map.m_pblk);
830 18466080 : if (unlikely(!bh))
831 : return ERR_PTR(-ENOMEM);
832 18466080 : if (map.m_flags & EXT4_MAP_NEW) {
833 401407 : ASSERT(create != 0);
834 401407 : ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
835 : || (handle != NULL));
836 :
837 : /*
838 : * Now that we do not always journal data, we should
839 : * keep in mind whether this should always journal the
840 : * new buffer as metadata. For now, regular file
841 : * writes use ext4_get_block instead, so it's not a
842 : * problem.
843 : */
844 401407 : lock_buffer(bh);
845 401448 : BUFFER_TRACE(bh, "call get_create_access");
846 401448 : err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
847 : EXT4_JTR_NONE);
848 401439 : if (unlikely(err)) {
849 0 : unlock_buffer(bh);
850 0 : goto errout;
851 : }
852 802918 : if (!buffer_uptodate(bh)) {
853 370138 : memset(bh->b_data, 0, inode->i_sb->s_blocksize);
854 370138 : set_buffer_uptodate(bh);
855 : }
856 401506 : unlock_buffer(bh);
857 401457 : BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
858 401457 : err = ext4_handle_dirty_metadata(handle, inode, bh);
859 401527 : if (unlikely(err))
860 0 : goto errout;
861 : } else
862 : BUFFER_TRACE(bh, "not a new buffer");
863 : return bh;
864 0 : errout:
865 0 : brelse(bh);
866 0 : return ERR_PTR(err);
867 : }
868 :
869 16314350 : struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
870 : ext4_lblk_t block, int map_flags)
871 : {
872 16314350 : struct buffer_head *bh;
873 16314350 : int ret;
874 :
875 16314350 : bh = ext4_getblk(handle, inode, block, map_flags);
876 16340118 : if (IS_ERR(bh))
877 : return bh;
878 16314423 : if (!bh || ext4_buffer_uptodate(bh))
879 16282282 : return bh;
880 :
881 21003 : ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
882 21003 : if (ret) {
883 1 : put_bh(bh);
884 1 : return ERR_PTR(ret);
885 : }
886 : return bh;
887 : }
888 :
889 : /* Read a contiguous batch of blocks. */
890 2148889 : int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
891 : bool wait, struct buffer_head **bhs)
892 : {
893 2148889 : int i, err;
894 :
895 4301030 : for (i = 0; i < bh_count; i++) {
896 2149206 : bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
897 2152141 : if (IS_ERR(bhs[i])) {
898 0 : err = PTR_ERR(bhs[i]);
899 0 : bh_count = i;
900 0 : goto out_brelse;
901 : }
902 : }
903 :
904 4298379 : for (i = 0; i < bh_count; i++)
905 : /* Note that NULL bhs[i] is valid because of holes. */
906 2151350 : if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
907 1049 : ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
908 :
909 2147029 : if (!wait)
910 : return 0;
911 :
912 321 : for (i = 0; i < bh_count; i++)
913 279 : if (bhs[i])
914 279 : wait_on_buffer(bhs[i]);
915 :
916 321 : for (i = 0; i < bh_count; i++) {
917 558 : if (bhs[i] && !buffer_uptodate(bhs[i])) {
918 0 : err = -EIO;
919 0 : goto out_brelse;
920 : }
921 : }
922 : return 0;
923 :
924 0 : out_brelse:
925 0 : for (i = 0; i < bh_count; i++) {
926 0 : brelse(bhs[i]);
927 0 : bhs[i] = NULL;
928 : }
929 : return err;
930 : }
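
/*
 * A hedged usage sketch for ext4_bread_batch(), based only on the
 * signature and comments above: read four consecutive blocks and wait
 * for completion. read_four_blocks() is an illustrative assumption;
 * NULL entries in bhs[] are valid holes, and brelse(NULL) is a no-op.
 */
static int read_four_blocks(struct inode *inode, ext4_lblk_t block)
{
	struct buffer_head *bhs[4] = { NULL };
	int i, err;

	err = ext4_bread_batch(inode, block, 4, true /* wait */, bhs);
	if (err)
		return err;
	for (i = 0; i < 4; i++)
		brelse(bhs[i]);
	return 0;
}
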
931 :
932 143 : int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
933 : struct buffer_head *head,
934 : unsigned from,
935 : unsigned to,
936 : int *partial,
937 : int (*fn)(handle_t *handle, struct inode *inode,
938 : struct buffer_head *bh))
939 : {
940 143 : struct buffer_head *bh;
941 143 : unsigned block_start, block_end;
942 143 : unsigned blocksize = head->b_size;
943 143 : int err, ret = 0;
944 143 : struct buffer_head *next;
945 :
946 143 : for (bh = head, block_start = 0;
947 286 : ret == 0 && (bh != head || !block_start);
948 : block_start = block_end, bh = next) {
949 143 : next = bh->b_this_page;
950 143 : block_end = block_start + blocksize;
951 143 : if (block_end <= from || block_start >= to) {
952 0 : if (partial && !buffer_uptodate(bh))
953 0 : *partial = 1;
954 0 : continue;
955 : }
956 143 : err = (*fn)(handle, inode, bh);
957 143 : if (!ret)
958 143 : ret = err;
959 : }
960 143 : return ret;
961 : }
962 :
963 : /*
964 : * Helper for handling dirtying of journalled data. We also mark the folio as
965 : * dirty so that the writeback code knows this folio (and inode) contains
966 : * dirty data. ext4_writepages() then commits the appropriate transaction to
967 : * make data stable.
968 : */
969 0 : static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
970 : {
971 0 : folio_mark_dirty(bh->b_folio);
972 0 : return ext4_handle_dirty_metadata(handle, NULL, bh);
973 : }
974 :
975 0 : int do_journal_get_write_access(handle_t *handle, struct inode *inode,
976 : struct buffer_head *bh)
977 : {
978 0 : int dirty = buffer_dirty(bh);
979 0 : int ret;
980 :
981 0 : if (!buffer_mapped(bh) || buffer_freed(bh))
982 : return 0;
983 : /*
984 : * __block_write_begin() could have dirtied some buffers. Clean
985 : * the dirty bit as jbd2_journal_get_write_access() could complain
986 : * otherwise about fs integrity issues. Setting of the dirty bit
987 : * by __block_write_begin() isn't a real problem here as we clear
988 : * the bit before releasing a page lock and thus writeback cannot
989 : * ever write the buffer.
990 : */
991 0 : if (dirty)
992 0 : clear_buffer_dirty(bh);
993 0 : BUFFER_TRACE(bh, "get write access");
994 0 : ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
995 : EXT4_JTR_NONE);
996 0 : if (!ret && dirty)
997 0 : ret = ext4_dirty_journalled_data(handle, bh);
998 : return ret;
999 : }
1000 :
1001 : #ifdef CONFIG_FS_ENCRYPTION
1002 : static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
1003 : get_block_t *get_block)
1004 : {
1005 : unsigned from = pos & (PAGE_SIZE - 1);
1006 : unsigned to = from + len;
1007 : struct inode *inode = folio->mapping->host;
1008 : unsigned block_start, block_end;
1009 : sector_t block;
1010 : int err = 0;
1011 : unsigned blocksize = inode->i_sb->s_blocksize;
1012 : unsigned bbits;
1013 : struct buffer_head *bh, *head, *wait[2];
1014 : int nr_wait = 0;
1015 : int i;
1016 :
1017 : BUG_ON(!folio_test_locked(folio));
1018 : BUG_ON(from > PAGE_SIZE);
1019 : BUG_ON(to > PAGE_SIZE);
1020 : BUG_ON(from > to);
1021 :
1022 : head = folio_buffers(folio);
1023 : if (!head) {
1024 : create_empty_buffers(&folio->page, blocksize, 0);
1025 : head = folio_buffers(folio);
1026 : }
1027 : bbits = ilog2(blocksize);
1028 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1029 :
1030 : for (bh = head, block_start = 0; bh != head || !block_start;
1031 : block++, block_start = block_end, bh = bh->b_this_page) {
1032 : block_end = block_start + blocksize;
1033 : if (block_end <= from || block_start >= to) {
1034 : if (folio_test_uptodate(folio)) {
1035 : set_buffer_uptodate(bh);
1036 : }
1037 : continue;
1038 : }
1039 : if (buffer_new(bh))
1040 : clear_buffer_new(bh);
1041 : if (!buffer_mapped(bh)) {
1042 : WARN_ON(bh->b_size != blocksize);
1043 : err = get_block(inode, block, bh, 1);
1044 : if (err)
1045 : break;
1046 : if (buffer_new(bh)) {
1047 : if (folio_test_uptodate(folio)) {
1048 : clear_buffer_new(bh);
1049 : set_buffer_uptodate(bh);
1050 : mark_buffer_dirty(bh);
1051 : continue;
1052 : }
1053 : if (block_end > to || block_start < from)
1054 : folio_zero_segments(folio, to,
1055 : block_end,
1056 : block_start, from);
1057 : continue;
1058 : }
1059 : }
1060 : if (folio_test_uptodate(folio)) {
1061 : set_buffer_uptodate(bh);
1062 : continue;
1063 : }
1064 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1065 : !buffer_unwritten(bh) &&
1066 : (block_start < from || block_end > to)) {
1067 : ext4_read_bh_lock(bh, 0, false);
1068 : wait[nr_wait++] = bh;
1069 : }
1070 : }
1071 : /*
1072 : * If we issued read requests, let them complete.
1073 : */
1074 : for (i = 0; i < nr_wait; i++) {
1075 : wait_on_buffer(wait[i]);
1076 : if (!buffer_uptodate(wait[i]))
1077 : err = -EIO;
1078 : }
1079 : if (unlikely(err)) {
1080 : folio_zero_new_buffers(folio, from, to);
1081 : } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
1082 : for (i = 0; i < nr_wait; i++) {
1083 : int err2;
1084 :
1085 : err2 = fscrypt_decrypt_pagecache_blocks(folio,
1086 : blocksize, bh_offset(wait[i]));
1087 : if (err2) {
1088 : clear_buffer_uptodate(wait[i]);
1089 : err = err2;
1090 : }
1091 : }
1092 : }
1093 :
1094 : return err;
1095 : }
1096 : #endif
1097 :
1098 : /*
1099 : * To preserve ordering, it is essential that the hole instantiation and
1100 : * the data write be encapsulated in a single transaction. We cannot
1101 : * close off a transaction and start a new one between the ext4_get_block()
1102 : * and the ext4_write_end(). So doing the jbd2_journal_start at the start of
1103 : * ext4_write_begin() is the right place.
1104 : */
1105 127589 : static int ext4_write_begin(struct file *file, struct address_space *mapping,
1106 : loff_t pos, unsigned len,
1107 : struct page **pagep, void **fsdata)
1108 : {
1109 127589 : struct inode *inode = mapping->host;
1110 127589 : int ret, needed_blocks;
1111 127589 : handle_t *handle;
1112 127589 : int retries = 0;
1113 127589 : struct folio *folio;
1114 127589 : pgoff_t index;
1115 127589 : unsigned from, to;
1116 :
1117 255178 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
1118 : return -EIO;
1119 :
1120 127589 : trace_ext4_write_begin(inode, pos, len);
1121 : /*
1122 : * Reserve one block more for addition to orphan list in case
1123 : * we allocate blocks but write fails for some reason
1124 : */
1125 127823 : needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1126 127325 : index = pos >> PAGE_SHIFT;
1127 127325 : from = pos & (PAGE_SIZE - 1);
1128 127325 : to = from + len;
1129 :
1130 127325 : if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
1131 0 : ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
1132 : pagep);
1133 0 : if (ret < 0)
1134 : return ret;
1135 0 : if (ret == 1)
1136 : return 0;
1137 : }
1138 :
1139 : /*
1140 : * __filemap_get_folio() can take a long time if the
1141 : * system is thrashing due to memory pressure, or if the folio
1142 : * is being written back. So grab it first before we start
1143 : * the transaction handle. This also allows us to allocate
1144 : * the folio (if needed) without using GFP_NOFS.
1145 : */
1146 127325 : retry_grab:
1147 130028 : folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
1148 : mapping_gfp_mask(mapping));
1149 128878 : if (IS_ERR(folio))
1150 0 : return PTR_ERR(folio);
1151 : /*
1152 : * As with the folio allocation above, we preallocate buffer heads
1153 : * before starting the handle.
1154 : */
1155 128878 : if (!folio_buffers(folio))
1156 92378 : create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
1157 :
1158 130719 : folio_unlock(folio);
1159 :
1160 134666 : retry_journal:
1161 134666 : handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
1162 133601 : if (IS_ERR(handle)) {
1163 0 : folio_put(folio);
1164 0 : return PTR_ERR(handle);
1165 : }
1166 :
1167 133601 : folio_lock(folio);
1168 133649 : if (folio->mapping != mapping) {
1169 : /* The folio got truncated from under us */
1170 2707 : folio_unlock(folio);
1171 2707 : folio_put(folio);
1172 2704 : ext4_journal_stop(handle);
1173 2703 : goto retry_grab;
1174 : }
1175 : /* In case writeback began while the folio was unlocked */
1176 130942 : folio_wait_stable(folio);
1177 :
1178 : #ifdef CONFIG_FS_ENCRYPTION
1179 : if (ext4_should_dioread_nolock(inode))
1180 : ret = ext4_block_write_begin(folio, pos, len,
1181 : ext4_get_block_unwritten);
1182 : else
1183 : ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
1184 : #else
1185 130887 : if (ext4_should_dioread_nolock(inode))
1186 126499 : ret = __block_write_begin(&folio->page, pos, len,
1187 : ext4_get_block_unwritten);
1188 : else
1189 4312 : ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
1190 : #endif
1191 254879 : if (!ret && ext4_should_journal_data(inode)) {
1192 0 : ret = ext4_walk_page_buffers(handle, inode,
1193 : folio_buffers(folio), from, to,
1194 : NULL, do_journal_get_write_access);
1195 : }
1196 :
1197 130336 : if (ret) {
1198 6494 : bool extended = (pos + len > inode->i_size) &&
1199 : !ext4_verity_in_progress(inode);
1200 :
1201 6494 : folio_unlock(folio);
1202 : /*
1203 : * __block_write_begin may have instantiated a few blocks
1204 : * outside i_size. Trim these off again. Don't need
1205 : * i_size_read because we hold i_rwsem.
1206 : *
1207 : * Add inode to orphan list in case we crash before
1208 : * truncate finishes
1209 : */
1210 6494 : if (extended && ext4_can_truncate(inode))
1211 3786 : ext4_orphan_add(handle, inode);
1212 :
1213 6494 : ext4_journal_stop(handle);
1214 6489 : if (extended) {
1215 3783 : ext4_truncate_failed_write(inode);
1216 : /*
1217 : * If truncate failed early the inode might
1218 : * still be on the orphan list; we need to
1219 : * make sure the inode is removed from the
1220 : * orphan list in that case.
1221 : */
1222 3786 : if (inode->i_nlink)
1223 3786 : ext4_orphan_del(NULL, inode);
1224 : }
1225 :
1226 12972 : if (ret == -ENOSPC &&
1227 6492 : ext4_should_retry_alloc(inode->i_sb, &retries))
1228 3950 : goto retry_journal;
1229 2530 : folio_put(folio);
1230 2530 : return ret;
1231 : }
1232 123842 : *pagep = &folio->page;
1233 123842 : return ret;
1234 : }
1235 :
1236 : /* For write_end() in data=journal mode */
1237 0 : static int write_end_fn(handle_t *handle, struct inode *inode,
1238 : struct buffer_head *bh)
1239 : {
1240 0 : int ret;
1241 0 : if (!buffer_mapped(bh) || buffer_freed(bh))
1242 : return 0;
1243 0 : set_buffer_uptodate(bh);
1244 0 : ret = ext4_dirty_journalled_data(handle, bh);
1245 0 : clear_buffer_meta(bh);
1246 0 : clear_buffer_prio(bh);
1247 0 : return ret;
1248 : }
1249 :
1250 : /*
1251 : * We need to pick up the new inode size which generic_commit_write gave us.
1252 : * `file' can be NULL - eg, when called from page_symlink().
1253 : *
1254 : * ext4 never places buffers on inode->i_mapping->private_list. Metadata
1255 : * buffers are managed internally.
1256 : */
1257 124602 : static int ext4_write_end(struct file *file,
1258 : struct address_space *mapping,
1259 : loff_t pos, unsigned len, unsigned copied,
1260 : struct page *page, void *fsdata)
1261 : {
1262 124602 : struct folio *folio = page_folio(page);
1263 125028 : handle_t *handle = ext4_journal_current_handle();
1264 125028 : struct inode *inode = mapping->host;
1265 125028 : loff_t old_size = inode->i_size;
1266 125028 : int ret = 0, ret2;
1267 125028 : int i_size_changed = 0;
1268 125028 : bool verity = ext4_verity_in_progress(inode);
1269 :
1270 125028 : trace_ext4_write_end(inode, pos, len, copied);
1271 :
1272 124892 : if (ext4_has_inline_data(inode) &&
1273 : ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
1274 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
1275 : folio);
1276 :
1277 124892 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1278 : /*
1279 : * it's important to update i_size while still holding folio lock:
1280 : * page writeout could otherwise come in and zero beyond i_size.
1281 : *
1282 : * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
1283 : * blocks are being written past EOF, so skip the i_size update.
1284 : */
1285 125428 : if (!verity)
1286 125428 : i_size_changed = ext4_update_inode_size(inode, pos + copied);
1287 125374 : folio_unlock(folio);
1288 125498 : folio_put(folio);
1289 :
1290 125516 : if (old_size < pos && !verity)
1291 1028 : pagecache_isize_extended(inode, old_size, pos);
1292 : /*
1293 : * Don't mark the inode dirty under folio lock. First, it unnecessarily
1294 : * makes the holding time of folio lock longer. Second, it forces lock
1295 : * ordering of folio lock and transaction start for journaling
1296 : * filesystems.
1297 : */
1298 125516 : if (i_size_changed)
1299 19759 : ret = ext4_mark_inode_dirty(handle, inode);
1300 :
1301 125524 : if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
1302 : /* If we allocated more blocks than we copied, we
1303 : * will have blocks allocated outside inode->i_size,
1304 : * so truncate them.
1305 : */
1306 0 : ext4_orphan_add(handle, inode);
1307 :
1308 125524 : ret2 = ext4_journal_stop(handle);
1309 124909 : if (!ret)
1310 124909 : ret = ret2;
1311 :
1312 124909 : if (pos + len > inode->i_size && !verity) {
1313 0 : ext4_truncate_failed_write(inode);
1314 : /*
1315 : * If truncate failed early the inode might still be
1316 : * on the orphan list; we need to make sure the inode
1317 : * is removed from the orphan list in that case.
1318 : */
1319 0 : if (inode->i_nlink)
1320 0 : ext4_orphan_del(NULL, inode);
1321 : }
1322 :
1323 124909 : return ret ? ret : copied;
1324 : }
1325 :
1326 : /*
1327 : * This is a private version of folio_zero_new_buffers() which doesn't
1328 : * set the buffer to be dirty, since in data=journalled mode we need
1329 : * to call ext4_dirty_journalled_data() instead.
1330 : */
1331 0 : static void ext4_journalled_zero_new_buffers(handle_t *handle,
1332 : struct inode *inode,
1333 : struct folio *folio,
1334 : unsigned from, unsigned to)
1335 : {
1336 0 : unsigned int block_start = 0, block_end;
1337 0 : struct buffer_head *head, *bh;
1338 :
1339 0 : bh = head = folio_buffers(folio);
1340 0 : do {
1341 0 : block_end = block_start + bh->b_size;
1342 0 : if (buffer_new(bh)) {
1343 0 : if (block_end > from && block_start < to) {
1344 0 : if (!folio_test_uptodate(folio)) {
1345 0 : unsigned start, size;
1346 :
1347 0 : start = max(from, block_start);
1348 0 : size = min(to, block_end) - start;
1349 :
1350 0 : folio_zero_range(folio, start, size);
1351 0 : write_end_fn(handle, inode, bh);
1352 : }
1353 0 : clear_buffer_new(bh);
1354 : }
1355 : }
1356 0 : block_start = block_end;
1357 0 : bh = bh->b_this_page;
1358 0 : } while (bh != head);
1359 0 : }
1360 :
1361 0 : static int ext4_journalled_write_end(struct file *file,
1362 : struct address_space *mapping,
1363 : loff_t pos, unsigned len, unsigned copied,
1364 : struct page *page, void *fsdata)
1365 : {
1366 0 : struct folio *folio = page_folio(page);
1367 0 : handle_t *handle = ext4_journal_current_handle();
1368 0 : struct inode *inode = mapping->host;
1369 0 : loff_t old_size = inode->i_size;
1370 0 : int ret = 0, ret2;
1371 0 : int partial = 0;
1372 0 : unsigned from, to;
1373 0 : int size_changed = 0;
1374 0 : bool verity = ext4_verity_in_progress(inode);
1375 :
1376 0 : trace_ext4_journalled_write_end(inode, pos, len, copied);
1377 0 : from = pos & (PAGE_SIZE - 1);
1378 0 : to = from + len;
1379 :
1380 0 : BUG_ON(!ext4_handle_valid(handle));
1381 :
1382 0 : if (ext4_has_inline_data(inode))
1383 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
1384 : folio);
1385 :
1386 0 : if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
1387 0 : copied = 0;
1388 0 : ext4_journalled_zero_new_buffers(handle, inode, folio,
1389 : from, to);
1390 : } else {
1391 0 : if (unlikely(copied < len))
1392 0 : ext4_journalled_zero_new_buffers(handle, inode, folio,
1393 : from + copied, to);
1394 0 : ret = ext4_walk_page_buffers(handle, inode,
1395 : folio_buffers(folio),
1396 : from, from + copied, &partial,
1397 : write_end_fn);
1398 0 : if (!partial)
1399 0 : folio_mark_uptodate(folio);
1400 : }
1401 0 : if (!verity)
1402 0 : size_changed = ext4_update_inode_size(inode, pos + copied);
1403 0 : EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1404 0 : folio_unlock(folio);
1405 0 : folio_put(folio);
1406 :
1407 0 : if (old_size < pos && !verity)
1408 0 : pagecache_isize_extended(inode, old_size, pos);
1409 :
1410 0 : if (size_changed) {
1411 0 : ret2 = ext4_mark_inode_dirty(handle, inode);
1412 0 : if (!ret)
1413 0 : ret = ret2;
1414 : }
1415 :
1416 0 : if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
1417 : /* If we allocated more blocks than we copied, we
1418 : * will have blocks allocated outside inode->i_size,
1419 : * so truncate them.
1420 : */
1421 0 : ext4_orphan_add(handle, inode);
1422 :
1423 0 : ret2 = ext4_journal_stop(handle);
1424 0 : if (!ret)
1425 0 : ret = ret2;
1426 0 : if (pos + len > inode->i_size && !verity) {
1427 0 : ext4_truncate_failed_write(inode);
1428 : /*
1429 : * If truncate failed early the inode might still be
1430 : * on the orphan list; we need to make sure the inode
1431 : * is removed from the orphan list in that case.
1432 : */
1433 0 : if (inode->i_nlink)
1434 0 : ext4_orphan_del(NULL, inode);
1435 : }
1436 :
1437 0 : return ret ? ret : copied;
1438 : }
1439 :
1440 : /*
1441 : * Reserve space for a single cluster
1442 : */
1443 24727320 : static int ext4_da_reserve_space(struct inode *inode)
1444 : {
1445 24727320 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1446 24727320 : struct ext4_inode_info *ei = EXT4_I(inode);
1447 24727320 : int ret;
1448 :
1449 : /*
1450 : * We will charge metadata quota at writeout time; this saves
1451 : * us from metadata over-estimation, though we may go over by
1452 : * a small amount in the end. Here we just reserve for data.
1453 : */
1454 24727320 : ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1455 24726707 : if (ret)
1456 : return ret;
1457 :
1458 24724064 : spin_lock(&ei->i_block_reservation_lock);
1459 24759819 : if (ext4_claim_free_clusters(sbi, 1, 0)) {
1460 112041 : spin_unlock(&ei->i_block_reservation_lock);
1461 112058 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1462 112058 : return -ENOSPC;
1463 : }
1464 24624975 : ei->i_reserved_data_blocks++;
1465 24624975 : trace_ext4_da_reserve_space(inode);
1466 24553620 : spin_unlock(&ei->i_block_reservation_lock);
1467 :
1468 24553620 : return 0; /* success */
1469 : }
1470 :
1471 6672730 : void ext4_da_release_space(struct inode *inode, int to_free)
1472 : {
1473 6672730 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 6672730 : struct ext4_inode_info *ei = EXT4_I(inode);
1475 :
1476 6672730 : if (!to_free)
1477 : return; /* Nothing to release, exit */
1478 :
1479 1012134 : spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1480 :
1481 1012301 : trace_ext4_da_release_space(inode, to_free);
1482 1012117 : if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1483 : /*
1484 : * If there aren't enough reserved blocks, then the
1485 : * counter is messed up somewhere. Since this
1486 : * function is called from the invalidate path, it's
1487 : * harmless to return without any action.
1488 : */
1489 0 : ext4_warning(inode->i_sb, "ext4_da_release_space: "
1490 : "ino %lu, to_free %d with only %d reserved "
1491 : "data blocks", inode->i_ino, to_free,
1492 : ei->i_reserved_data_blocks);
1493 0 : WARN_ON(1);
1494 0 : to_free = ei->i_reserved_data_blocks;
1495 : }
1496 1012117 : ei->i_reserved_data_blocks -= to_free;
1497 :
1498 : /* update fs dirty data blocks counter */
1499 1012117 : percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1500 :
1501 1012136 : spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1502 :
1503 1012295 : dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1504 : }
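
/*
 * A toy userspace sketch of the reserve/release pairing implemented
 * by ext4_da_reserve_space() and ext4_da_release_space() above: claim
 * one cluster from a shared free counter, and give reservations back
 * symmetrically. Every name here is an illustrative assumption, and a
 * single mutex stands in for ext4's per-inode i_block_reservation_lock
 * plus its per-cpu dirty-clusters counter.
 */
#include <errno.h>
#include <pthread.h>

static pthread_mutex_t resv_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long toy_free_clusters = 1024;	/* sb-wide free count */
static unsigned int toy_reserved_blocks;	/* per-inode reservation */

static int toy_da_reserve_space(void)
{
	pthread_mutex_lock(&resv_lock);
	if (toy_free_clusters == 0) {	/* claiming a free cluster failed */
		pthread_mutex_unlock(&resv_lock);
		return -ENOSPC;
	}
	toy_free_clusters--;
	toy_reserved_blocks++;
	pthread_mutex_unlock(&resv_lock);
	return 0;
}

static void toy_da_release_space(unsigned int to_free)
{
	pthread_mutex_lock(&resv_lock);
	if (to_free > toy_reserved_blocks)	/* counter messed up somewhere */
		to_free = toy_reserved_blocks;
	toy_reserved_blocks -= to_free;
	toy_free_clusters += to_free;
	pthread_mutex_unlock(&resv_lock);
}
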
1505 :
1506 : /*
1507 : * Delayed allocation stuff
1508 : */
1509 :
1510 : struct mpage_da_data {
1511 : /* These are input fields for ext4_do_writepages() */
1512 : struct inode *inode;
1513 : struct writeback_control *wbc;
1514 : unsigned int can_map:1; /* Can writepages call map blocks? */
1515 :
1516 : /* These are internal state of ext4_do_writepages() */
1517 : pgoff_t first_page; /* The first page to write */
1518 : pgoff_t next_page; /* Current page to examine */
1519 : pgoff_t last_page; /* Last page to examine */
1520 : /*
1521 : * Extent to map - this can be after first_page because that can be
1522 : * fully mapped. We somewhat abuse m_flags to store whether the extent
1523 : * is delalloc or unwritten.
1524 : */
1525 : struct ext4_map_blocks map;
1526 : struct ext4_io_submit io_submit; /* IO submission data */
1527 : unsigned int do_map:1;
1528 : unsigned int scanned_until_end:1;
1529 : unsigned int journalled_more_data:1;
1530 : };
1531 :
1532 2555113 : static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1533 : bool invalidate)
1534 : {
1535 2555113 : unsigned nr, i;
1536 2555113 : pgoff_t index, end;
1537 2555113 : struct folio_batch fbatch;
1538 2555113 : struct inode *inode = mpd->inode;
1539 2555113 : struct address_space *mapping = inode->i_mapping;
1540 :
1541 : /* This is necessary when next_page == 0. */
1542 2555113 : if (mpd->first_page >= mpd->next_page)
1543 1649293 : return;
1544 :
1545 905820 : mpd->scanned_until_end = 0;
1546 905820 : index = mpd->first_page;
1547 905820 : end = mpd->next_page - 1;
1548 905820 : if (invalidate) {
1549 0 : ext4_lblk_t start, last;
1550 0 : start = index << (PAGE_SHIFT - inode->i_blkbits);
1551 0 : last = end << (PAGE_SHIFT - inode->i_blkbits);
1552 :
1553 : /*
1554 : * avoid racing with extent status tree scans made by
1555 : * ext4_insert_delayed_block()
1556 : */
1557 0 : down_write(&EXT4_I(inode)->i_data_sem);
1558 0 : ext4_es_remove_extent(inode, start, last - start + 1);
1559 0 : up_write(&EXT4_I(inode)->i_data_sem);
1560 : }
1561 :
1562 905814 : folio_batch_init(&fbatch);
1563 2082194 : while (index <= end) {
1564 1176378 : nr = filemap_get_folios(mapping, &index, end, &fbatch);
1565 1176373 : if (nr == 0)
1566 : break;
1567 6466205 : for (i = 0; i < nr; i++) {
1568 5289816 : struct folio *folio = fbatch.folios[i];
1569 :
1570 5289810 : if (folio->index < mpd->first_page)
1571 0 : continue;
1572 5289810 : if (folio->index + folio_nr_pages(folio) - 1 > end)
1573 0 : continue;
1574 5289810 : BUG_ON(!folio_test_locked(folio));
1575 5289810 : BUG_ON(folio_test_writeback(folio));
1576 5289810 : if (invalidate) {
1577 0 : if (folio_mapped(folio))
1578 0 : folio_clear_dirty_for_io(folio);
1579 0 : block_invalidate_folio(folio, 0,
1580 : folio_size(folio));
1581 0 : folio_clear_uptodate(folio);
1582 : }
1583 5289810 : folio_unlock(folio);
1584 : }
1585 1176389 : folio_batch_release(&fbatch);
1586 : }
1587 : }
1588 :
1589 0 : static void ext4_print_free_blocks(struct inode *inode)
1590 : {
1591 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1592 0 : struct super_block *sb = inode->i_sb;
1593 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1594 :
1595 0 : ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1596 : EXT4_C2B(EXT4_SB(inode->i_sb),
1597 : ext4_count_free_clusters(sb)));
1598 0 : ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1599 0 : ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1600 : (long long) EXT4_C2B(EXT4_SB(sb),
1601 : percpu_counter_sum(&sbi->s_freeclusters_counter)));
1602 0 : ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1603 : (long long) EXT4_C2B(EXT4_SB(sb),
1604 : percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1605 0 : ext4_msg(sb, KERN_CRIT, "Block reservation details");
1606 0 : ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1607 : ei->i_reserved_data_blocks);
1608 0 : return;
1609 : }
1610 :
1611 : /*
1612 : * ext4_insert_delayed_block - adds a delayed block to the extents status
1613 : * tree, incrementing the reserved cluster/block
1614 : * count or making a pending reservation
1615 : * where needed
1616 : *
1617 : * @inode - file containing the newly added block
1618 : * @lblk - logical block to be added
1619 : *
1620 : * Returns 0 on success, negative error code on failure.
1621 : */
1622 24804930 : static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
1623 : {
1624 24804930 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1625 24804930 : int ret;
1626 24804930 : bool allocated = false;
1627 :
1628 : /*
1629 : * If the cluster containing lblk is shared with a delayed,
1630 : * written, or unwritten extent in a bigalloc file system, it's
1631 : * already been accounted for and does not need to be reserved.
1632 : * A pending reservation must be made for the cluster if it's
1633 : * shared with a written or unwritten extent and doesn't already
1634 : * have one. Written and unwritten extents can be purged from the
1635 : * extents status tree if the system is under memory pressure, so
1636 : * it's necessary to examine the extent tree if a search of the
1637 : * extents status tree doesn't get a match.
1638 : */
1639 24804930 : if (sbi->s_cluster_ratio == 1) {
1640 24739394 : ret = ext4_da_reserve_space(inode);
1641 24758318 : if (ret != 0) /* ENOSPC */
1642 : return ret;
1643 : } else { /* bigalloc */
1644 65536 : if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
1645 4097 : if (!ext4_es_scan_clu(inode,
1646 : &ext4_es_is_mapped, lblk)) {
1647 8192 : ret = ext4_clu_mapped(inode,
1648 4096 : EXT4_B2C(sbi, lblk));
1649 4096 : if (ret < 0)
1650 : return ret;
1651 4096 : if (ret == 0) {
1652 4096 : ret = ext4_da_reserve_space(inode);
1653 4096 : if (ret != 0) /* ENOSPC */
1654 : return ret;
1655 : } else {
1656 : allocated = true;
1657 : }
1658 : } else {
1659 : allocated = true;
1660 : }
1661 : }
1662 : }
1663 :
1664 24708985 : ext4_es_insert_delayed_block(inode, lblk, allocated);
1665 24708985 : return 0;
1666 : }
1667 :
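 : /*
 : * A worked example of the bigalloc branch in ext4_insert_delayed_block()
 : * above, assuming a cluster ratio of 16 (64K clusters, 4K blocks) and an
 : * lblk whose cluster has no delayed extent cached:
 : *
 : * ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk) -> false
 : * ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk) -> false
 : * ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)) -> 0 or > 0
 : *
 : * A result of 0 means the cluster is truly unallocated, so one cluster
 : * is reserved; a positive result means the cluster is already backed on
 : * disk, so allocated is set and no new reservation is taken.
 : */
 :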
1668 : /*
1669 : * This function borrows code from the very beginning of
1670 : * ext4_map_blocks, but assumes that the caller is in the delayed write
1671 : * path. It looks up the requested blocks and sets the
1672 : * buffer delay bit under the protection of i_data_sem.
1673 : */
1674 31491327 : static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1675 : struct ext4_map_blocks *map,
1676 : struct buffer_head *bh)
1677 : {
1678 31491327 : struct extent_status es;
1679 31491327 : int retval;
1680 31491327 : sector_t invalid_block = ~((sector_t) 0xffff);
1681 : #ifdef ES_AGGRESSIVE_TEST
1682 : struct ext4_map_blocks orig_map;
1683 :
1684 : memcpy(&orig_map, map, sizeof(*map));
1685 : #endif
1686 :
1687 62860294 : if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1688 0 : invalid_block = ~0;
1689 :
1690 31491327 : map->m_flags = 0;
1691 31491327 : ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
1692 : (unsigned long) map->m_lblk);
1693 :
1694 : /* Lookup extent status tree firstly */
1695 31491327 : if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
1696 29605271 : if (ext4_es_is_hole(&es)) {
1697 23105231 : retval = 0;
1698 23105231 : down_read(&EXT4_I(inode)->i_data_sem);
1699 23104337 : goto add_delayed;
1700 : }
1701 :
1702 : /*
1703 : * A delayed extent could have been allocated by fallocate,
1704 : * so we need to check for that.
1705 : */
1706 6500040 : if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
1707 0 : map_bh(bh, inode->i_sb, invalid_block);
1708 0 : set_buffer_new(bh);
1709 0 : set_buffer_delay(bh);
1710 0 : return 0;
1711 : }
1712 :
1713 6500040 : map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
1714 6500040 : retval = es.es_len - (iblock - es.es_lblk);
1715 6500040 : if (retval > map->m_len)
1716 : retval = map->m_len;
1717 6500040 : map->m_len = retval;
1718 6500040 : if (ext4_es_is_written(&es))
1719 2085765 : map->m_flags |= EXT4_MAP_MAPPED;
1720 4414275 : else if (ext4_es_is_unwritten(&es))
1721 4414275 : map->m_flags |= EXT4_MAP_UNWRITTEN;
1722 : else
1723 0 : BUG();
1724 :
1725 : #ifdef ES_AGGRESSIVE_TEST
1726 : ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
1727 : #endif
1728 6500040 : return retval;
1729 : }
1730 :
1731 : /*
1732 : * Try to see if we can get the block without requesting a new
1733 : * file system block.
1734 : */
1735 1923408 : down_read(&EXT4_I(inode)->i_data_sem);
1736 1923399 : if (ext4_has_inline_data(inode))
1737 : retval = 0;
1738 1923399 : else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1739 1798179 : retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1740 : else
1741 125220 : retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1742 :
1743 1914711 : add_delayed:
1744 25019048 : if (retval == 0) {
1745 24805251 : int ret;
1746 :
1747 : /*
1748 : * XXX: __block_prepare_write() unmaps passed block,
1749 : * is it OK?
1750 : */
1751 :
1752 24805251 : ret = ext4_insert_delayed_block(inode, map->m_lblk);
1753 24762727 : if (ret != 0) {
1754 114801 : retval = ret;
1755 114801 : goto out_unlock;
1756 : }
1757 :
1758 24647926 : map_bh(bh, inode->i_sb, invalid_block);
1759 24708574 : set_buffer_new(bh);
1760 24715548 : set_buffer_delay(bh);
1761 213797 : } else if (retval > 0) {
1762 213796 : unsigned int status;
1763 :
1764 213796 : if (unlikely(retval != map->m_len)) {
1765 0 : ext4_warning(inode->i_sb,
1766 : "ES len assertion failed for inode "
1767 : "%lu: retval %d != map->m_len %d",
1768 : inode->i_ino, retval, map->m_len);
1769 0 : WARN_ON(1);
1770 : }
1771 :
1772 213796 : status = map->m_flags & EXT4_MAP_UNWRITTEN ?
1773 213796 : EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
1774 213796 : ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
1775 : map->m_pblk, status);
1776 : }
1777 :
1778 1 : out_unlock:
1779 25026641 : up_read((&EXT4_I(inode)->i_data_sem));
1780 :
1781 25026641 : return retval;
1782 : }
1783 :
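 : /*
 : * Control-flow sketch for ext4_da_map_blocks() above:
 : *
 : * es cache hit, hole -> take i_data_sem, goto add_delayed
 : * es cache hit, delayed-only -> return the invalid_block sentinel
 : * es cache hit, (un)written -> return the cached mapping directly
 : * es cache miss -> consult the extent/indirect tree under i_data_sem;
 : * a hole (retval == 0) falls through to add_delayed, which reserves
 : * space and marks the bh new + delay
 : */
 :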
1784 : /*
1785 : * This is a special get_block_t callback which is used by
1786 : * ext4_da_write_begin(). It will either return a mapped block or
1787 : * reserve space for a single block.
1788 : *
1789 : * For a delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1790 : * We also have b_blocknr = -1 and b_bdev initialized properly.
1791 : *
1792 : * For an unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1793 : * We also have b_blocknr = the physical block mapping the unwritten
1794 : * extent and b_bdev initialized properly.
1795 : */
1796 31513579 : int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1797 : struct buffer_head *bh, int create)
1798 : {
1799 31513579 : struct ext4_map_blocks map;
1800 31513579 : int ret = 0;
1801 :
1802 31513579 : BUG_ON(create == 0);
1803 31513579 : BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1804 :
1805 31513579 : map.m_lblk = iblock;
1806 31513579 : map.m_len = 1;
1807 :
1808 : /*
1809 : * First, we need to know whether the block is already allocated;
1810 : * preallocated blocks are unmapped but should be treated
1811 : * the same as allocated blocks.
1812 : */
1813 31513579 : ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1814 31512133 : if (ret <= 0)
1815 : return ret;
1816 :
1817 6713699 : map_bh(bh, inode->i_sb, map.m_pblk);
1818 6716957 : ext4_update_bh_state(bh, map.m_flags);
1819 :
1820 13436518 : if (buffer_unwritten(bh)) {
1821 : /* A delayed write to unwritten bh should be marked
1822 : * new and mapped. Mapped ensures that we don't do
1823 : * get_block multiple times when we write to the same
1824 : * offset and new ensures that we do proper zero out
1825 : * for partial write.
1826 : */
1827 4498595 : set_buffer_new(bh);
1828 4497931 : set_buffer_mapped(bh);
1829 : }
1830 : return 0;
1831 : }
1832 :
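 : /*
 : * Usage sketch: ext4_da_get_block_prep() above is the get_block_t that
 : * the delalloc write path plugs into the generic buffer code, as in
 : * ext4_da_write_begin() further below:
 : *
 : * ret = __block_write_begin(&folio->page, pos, len,
 : * ext4_da_get_block_prep);
 : *
 : * For a hole this leaves the buffer new + delay with the invalid_block
 : * sentinel as b_blocknr, so no real block is allocated at write time.
 : */
 :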
1833 25203866 : static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
1834 : {
1835 25203866 : mpd->first_page += folio_nr_pages(folio);
1836 25203866 : folio_unlock(folio);
1837 25207746 : }
1838 :
1839 25205631 : static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
1840 : {
1841 25205631 : size_t len;
1842 25205631 : loff_t size;
1843 25205631 : int err;
1844 :
1845 25205631 : BUG_ON(folio->index != mpd->first_page);
1846 25205631 : folio_clear_dirty_for_io(folio);
1847 : /*
1848 : * We have to be very careful here! Nothing protects writeback path
1849 : * against i_size changes and the page can be writeably mapped into
1850 : * page tables. So an application can be growing i_size and writing
1851 : * data through mmap while writeback runs. folio_clear_dirty_for_io()
1852 : * write-protects our page in page tables and the page cannot get
1853 : * written to again until we release folio lock. So only after
1854 : * folio_clear_dirty_for_io() we are safe to sample i_size for
1855 : * ext4_bio_write_folio() to zero-out tail of the written page. We rely
1856 : * on the barrier provided by folio_test_clear_dirty() in
1857 : * folio_clear_dirty_for_io() to make sure i_size is really sampled only
1858 : * after page tables are updated.
1859 : */
1860 25203606 : size = i_size_read(mpd->inode);
1861 25203606 : len = folio_size(folio);
1862 25203675 : if (folio_pos(folio) + len > size &&
1863 : !ext4_verity_in_progress(mpd->inode))
1864 356437 : len = size & ~PAGE_MASK;
1865 25203675 : err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
1866 25203968 : if (!err)
1867 25203968 : mpd->wbc->nr_to_write--;
1868 :
1869 25203968 : return err;
1870 : }
1871 :
1872 : #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
1873 :
1874 : /*
1875 : * mballoc gives us at most this number of blocks...
1876 : * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
1877 : * The rest of mballoc seems to handle chunks up to full group size.
1878 : */
1879 : #define MAX_WRITEPAGES_EXTENT_LEN 2048
1880 :
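 : /*
 : * For illustration: with 4K blocks this caps a single mapping round at
 : * 2048 * 4K = 8M of file data per extent handed to the allocator.
 : */
 :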
1881 : /*
1882 : * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1883 : *
1884 : * @mpd - extent of blocks
1885 : * @lblk - logical number of the block in the file
1886 : * @bh - buffer head we want to add to the extent
1887 : *
1888 : * The function is used to collect contiguous blocks in the same state. If the
1889 : * buffer doesn't require mapping for writeback and we haven't started the
1890 : * extent of buffers to map yet, the function returns 'true' immediately - the
1891 : * caller can write the buffer right away. Otherwise the function returns true
1892 : * if the block has been added to the extent, false if the block couldn't be
1893 : * added.
1894 : */
1895 30179380 : static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1896 : struct buffer_head *bh)
1897 : {
1898 30179380 : struct ext4_map_blocks *map = &mpd->map;
1899 :
1900 : /* Buffer that doesn't need mapping for writeback? */
1901 120718010 : if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1902 7703441 : (!buffer_delay(bh) && !buffer_unwritten(bh))) {
1903 : /* So far no extent to map => we write the buffer right away */
1904 2782209 : if (map->m_len == 0)
1905 : return true;
1906 77852 : return false;
1907 : }
1908 :
1909 : /* First block in the extent? */
1910 27397171 : if (map->m_len == 0) {
1911 : /* We cannot map unless handle is started... */
1912 1814610 : if (!mpd->do_map)
1913 : return false;
1914 1122925 : map->m_lblk = lblk;
1915 1122925 : map->m_len = 1;
1916 1122925 : map->m_flags = bh->b_state & BH_FLAGS;
1917 1122925 : return true;
1918 : }
1919 :
1920 : /* Don't go larger than mballoc is willing to allocate */
1921 25582561 : if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1922 : return false;
1923 :
1924 : /* Can we merge the block to our big extent? */
1925 25575768 : if (lblk == map->m_lblk + map->m_len &&
1926 25576208 : (bh->b_state & BH_FLAGS) == map->m_flags) {
1927 25494689 : map->m_len++;
1928 25494689 : return true;
1929 : }
1930 : return false;
1931 : }
1932 :
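 : /*
 : * A sketch of the merge rule above (numbers invented): a delayed buffer
 : * at lblk 100 merges into map = { m_lblk = 90, m_len = 10 } because
 : * 100 == 90 + 10 and its BH_FLAGS state matches m_flags, giving
 : * m_len = 11. A buffer at lblk 101, or one whose state differs (say
 : * unwritten instead of delayed), ends the extent and the function
 : * returns false.
 : */
 :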
1933 : /*
1934 : * mpage_process_page_bufs - submit page buffers for IO or add them to extent
1935 : *
1936 : * @mpd - extent of blocks for mapping
1937 : * @head - the first buffer in the page
1938 : * @bh - buffer we should start processing from
1939 : * @lblk - logical number of the block in the file corresponding to @bh
1940 : *
1941 : * Walk through page buffers from @bh up to @head (exclusive) and either submit
1942 : * the page for IO if all buffers in this page were mapped and there's no
1943 : * accumulated extent of buffers to map, or add buffers in the page to the
1944 : * extent of buffers to map. The function returns 1 if the caller can continue
1945 : * by processing the next page, 0 if it should stop adding buffers to the
1946 : * extent to map because we cannot extend it anymore. It can also return a
1947 : * value < 0 in case of an error during IO submission.
1948 : */
1949 30185282 : static int mpage_process_page_bufs(struct mpage_da_data *mpd,
1950 : struct buffer_head *head,
1951 : struct buffer_head *bh,
1952 : ext4_lblk_t lblk)
1953 : {
1954 30185282 : struct inode *inode = mpd->inode;
1955 30185282 : int err;
1956 30184923 : ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
1957 30185282 : >> inode->i_blkbits;
1958 :
1959 30184923 : if (ext4_verity_in_progress(inode))
1960 : blocks = EXT_MAX_BLOCKS;
1961 :
1962 30158236 : do {
1963 60316472 : BUG_ON(buffer_locked(bh));
1964 :
1965 30158236 : if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
1966 : /* Found extent to map? */
1967 857968 : if (mpd->map.m_len)
1968 : return 0;
1969 : /* Buffer needs mapping and handle is not started? */
1970 691744 : if (!mpd->do_map)
1971 : return 0;
1972 : /* Everything mapped so far and we hit EOF */
1973 : break;
1974 : }
1975 29295305 : } while (lblk++, (bh = bh->b_this_page) != head);
1976 : /* So far everything mapped? Submit the page for IO. */
1977 29322027 : if (mpd->map.m_len == 0) {
1978 2704397 : err = mpage_submit_folio(mpd, head->b_folio);
1979 2704287 : if (err < 0)
1980 : return err;
1981 2704287 : mpage_folio_done(mpd, head->b_folio);
1982 : }
1983 29322083 : if (lblk >= blocks) {
1984 471100 : mpd->scanned_until_end = 1;
1985 471100 : return 0;
1986 : }
1987 : return 1;
1988 : }
1989 :
1990 : /*
1991 : * mpage_process_folio - update folio buffers corresponding to changed extent
1992 : * and may submit fully mapped page for IO
1993 : * @mpd: description of extent to map, on return next extent to map
1994 : * @folio: Contains these buffers.
1995 : * @m_lblk: logical block mapping.
1996 : * @m_pblk: corresponding physical mapping.
1997 : * @map_bh: determines on return whether this page requires any further
1998 : * mapping or not.
1999 : *
2000 : * Scan given folio buffers corresponding to changed extent and update buffer
2001 : * state according to new extent state.
2002 : * We map delalloc buffers to their physical location, clear unwritten bits.
2003 : * If the given folio is not fully mapped, we update @mpd to the next extent in
2004 : * the given folio that needs mapping & return @map_bh as true.
2005 : */
2006 22190547 : static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
2007 : ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
2008 : bool *map_bh)
2009 : {
2010 22190547 : struct buffer_head *head, *bh;
2011 22190547 : ext4_io_end_t *io_end = mpd->io_submit.io_end;
2012 22190547 : ext4_lblk_t lblk = *m_lblk;
2013 22190547 : ext4_fsblk_t pblock = *m_pblk;
2014 22190547 : int err = 0;
2015 22190547 : int blkbits = mpd->inode->i_blkbits;
2016 22190547 : ssize_t io_end_size = 0;
2017 22190547 : struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
2018 :
2019 22190457 : bh = head = folio_buffers(folio);
2020 22189982 : do {
2021 22189982 : if (lblk < mpd->map.m_lblk)
2022 0 : continue;
2023 22189982 : if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2024 : /*
2025 : * Buffer after end of mapped extent.
2026 : * Find next buffer in the folio to map.
2027 : */
2028 1 : mpd->map.m_len = 0;
2029 1 : mpd->map.m_flags = 0;
2030 1 : io_end_vec->size += io_end_size;
2031 :
2032 1 : err = mpage_process_page_bufs(mpd, head, bh, lblk);
2033 1 : if (err > 0)
2034 : err = 0;
2035 1 : if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
2036 0 : io_end_vec = ext4_alloc_io_end_vec(io_end);
2037 0 : if (IS_ERR(io_end_vec)) {
2038 0 : err = PTR_ERR(io_end_vec);
2039 0 : goto out;
2040 : }
2041 0 : io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
2042 : }
2043 1 : *map_bh = true;
2044 1 : goto out;
2045 : }
2046 44379962 : if (buffer_delay(bh)) {
2047 18285168 : clear_buffer_delay(bh);
2048 18287081 : bh->b_blocknr = pblock++;
2049 : }
2050 22191894 : clear_buffer_unwritten(bh);
2051 22190729 : io_end_size += (1 << blkbits);
2052 22190729 : } while (lblk++, (bh = bh->b_this_page) != head);
2053 :
2054 22191204 : io_end_vec->size += io_end_size;
2055 22191204 : *map_bh = false;
2056 22191205 : out:
2057 22191205 : *m_lblk = lblk;
2058 22191205 : *m_pblk = pblock;
2059 22191205 : return err;
2060 : }
2061 :
2062 : /*
2063 : * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and
2064 : * submit fully mapped pages for IO
2065 : *
2066 : * @mpd - description of extent to map, on return next extent to map
2067 : *
2068 : * Scan buffers corresponding to changed extent (we expect corresponding pages
2069 : * to be already locked) and update buffer state according to new extent state.
2070 : * We map delalloc buffers to their physical location, clear unwritten bits,
2071 : * and mark buffers as uninit when we perform writes to unwritten extents
2072 : * and do extent conversion after IO is finished. If the last page is not fully
2073 : * mapped, we update @mpd to the next extent in the last page that needs
2074 : * mapping. Otherwise we submit the page for IO.
2075 : */
2076 1122820 : static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2077 : {
2078 1122820 : struct folio_batch fbatch;
2079 1122820 : unsigned nr, i;
2080 1122820 : struct inode *inode = mpd->inode;
2081 1122820 : int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
2082 1122820 : pgoff_t start, end;
2083 1122820 : ext4_lblk_t lblk;
2084 1122820 : ext4_fsblk_t pblock;
2085 1122820 : int err;
2086 1122820 : bool map_bh = false;
2087 :
2088 1122820 : start = mpd->map.m_lblk >> bpp_bits;
2089 1122820 : end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2090 1122820 : lblk = start << bpp_bits;
2091 1122820 : pblock = mpd->map.m_pblk;
2092 :
2093 1122820 : folio_batch_init(&fbatch);
2094 3446396 : while (start <= end) {
2095 2323561 : nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
2096 2323564 : if (nr == 0)
2097 : break;
2098 24515472 : for (i = 0; i < nr; i++) {
2099 22191884 : struct folio *folio = fbatch.folios[i];
2100 :
2101 22191062 : err = mpage_process_folio(mpd, folio, &lblk, &pblock,
2102 : &map_bh);
2103 : /*
2104 : * If map_bh is true, the page may require further bh
2105 : * mapping, or the page may have been submitted for IO,
2106 : * so return to the caller to do further extent mapping.
2107 : */
2108 22190555 : if (err < 0 || map_bh)
2109 1 : goto out;
2110 : /* Page fully mapped - let IO run! */
2111 22190554 : err = mpage_submit_folio(mpd, folio);
2112 22188865 : if (err < 0)
2113 0 : goto out;
2114 22188865 : mpage_folio_done(mpd, folio);
2115 : }
2116 2323588 : folio_batch_release(&fbatch);
2117 : }
2118 : /* Extent fully mapped and matches with page boundary. We are done. */
2119 1122835 : mpd->map.m_len = 0;
2120 1122835 : mpd->map.m_flags = 0;
2121 1122835 : return 0;
2122 1 : out:
2123 1 : folio_batch_release(&fbatch);
2124 : return err;
2125 : }
2126 :
2127 1122927 : static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2128 : {
2129 1122927 : struct inode *inode = mpd->inode;
2130 1122927 : struct ext4_map_blocks *map = &mpd->map;
2131 1122927 : int get_blocks_flags;
2132 1122927 : int err, dioread_nolock;
2133 :
2134 1122927 : trace_ext4_da_write_pages_extent(inode, map);
2135 : /*
2136 : * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2137 : * to convert an unwritten extent to be initialized (in the case
2138 : * where we have written into one or more preallocated blocks). It is
2139 : * possible that we're going to need more metadata blocks than
2140 : * previously reserved. However, we must not fail because we're in
2141 : * writeback and there is nothing we can do about it, so it might result
2142 : * in data loss. So use reserved blocks to allocate metadata if
2143 : * possible.
2144 : *
2145 : * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
2146 : * the blocks in question are delalloc blocks. This indicates
2147 : * that the blocks and quotas have already been checked when
2148 : * the data was copied into the page cache.
2149 : */
2150 1122916 : get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2151 : EXT4_GET_BLOCKS_METADATA_NOFAIL |
2152 : EXT4_GET_BLOCKS_IO_SUBMIT;
2153 1122916 : dioread_nolock = ext4_should_dioread_nolock(inode);
2154 1122910 : if (dioread_nolock)
2155 1122708 : get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2156 1122910 : if (map->m_flags & BIT(BH_Delay))
2157 912531 : get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2158 :
2159 1122910 : err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2160 1122905 : if (err < 0)
2161 : return err;
2162 1122813 : if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
2163 1122603 : if (!mpd->io_submit.io_end->handle &&
2164 : ext4_handle_valid(handle)) {
2165 1122577 : mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2166 1122577 : handle->h_rsv_handle = NULL;
2167 : }
2168 1122603 : ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2169 : }
2170 :
2171 1122828 : BUG_ON(map->m_len == 0);
2172 : return 0;
2173 : }
2174 :
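 : /*
 : * A sketch of the flag combination built in mpage_map_one_extent()
 : * above for the common dioread_nolock case with delayed buffers:
 : *
 : * get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
 : * EXT4_GET_BLOCKS_METADATA_NOFAIL |
 : * EXT4_GET_BLOCKS_IO_SUBMIT |
 : * EXT4_GET_BLOCKS_IO_CREATE_EXT |
 : * EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 : *
 : * so blocks are allocated as an unwritten extent (converted once IO
 : * completes) and charged against the existing delalloc reservation.
 : */
 :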
2175 : /*
2176 : * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
2177 : * length mpd->map.m_len and submit pages underlying it for IO
2178 : *
2179 : * @handle - handle for journal operations
2180 : * @mpd - extent to map
2181 : * @give_up_on_write - we set this to true iff there is a fatal error and there
2182 : * is no hope of writing the data. The caller should discard
2183 : * dirty pages to avoid infinite loops.
2184 : *
2185 : * The function maps the extent starting at mpd->map.m_lblk of length
2186 : * mpd->map.m_len. If it is
2186 : * delayed, blocks are allocated, if it is unwritten, we may need to convert
2187 : * them to initialized or split the described range from larger unwritten
2188 : * extent. Note that we need not map all the described range since allocation
2189 : * can return less blocks or the range is covered by more unwritten extents. We
2190 : * cannot map more because we are limited by reserved transaction credits. On
2191 : * the other hand we always make sure that the last touched page is fully
2192 : * mapped so that it can be written out (and thus forward progress is
2193 : * guaranteed). After mapping we submit all mapped pages for IO.
2194 : */
2195 1122930 : static int mpage_map_and_submit_extent(handle_t *handle,
2196 : struct mpage_da_data *mpd,
2197 : bool *give_up_on_write)
2198 : {
2199 1122930 : struct inode *inode = mpd->inode;
2200 1122930 : struct ext4_map_blocks *map = &mpd->map;
2201 1122930 : int err;
2202 1122930 : loff_t disksize;
2203 1122930 : int progress = 0;
2204 1122930 : ext4_io_end_t *io_end = mpd->io_submit.io_end;
2205 1122930 : struct ext4_io_end_vec *io_end_vec;
2206 :
2207 1122930 : io_end_vec = ext4_alloc_io_end_vec(io_end);
2208 1122927 : if (IS_ERR(io_end_vec))
2209 0 : return PTR_ERR(io_end_vec);
2210 1122927 : io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
2211 1122930 : do {
2212 1122930 : err = mpage_map_one_extent(handle, mpd);
2213 1122917 : if (err < 0) {
2214 92 : struct super_block *sb = inode->i_sb;
2215 :
2216 184 : if (ext4_forced_shutdown(EXT4_SB(sb)) ||
2217 : ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
2218 0 : goto invalidate_dirty_pages;
2219 : /*
2220 : * Let the upper layers retry transient errors.
2221 : * In the case of ENOSPC, if ext4_count_free_clusters()
2222 : * is non-zero, a commit should free up blocks.
2223 : */
2224 92 : if ((err == -ENOMEM) ||
2225 92 : (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2226 92 : if (progress)
2227 0 : goto update_disksize;
2228 : return err;
2229 : }
2230 0 : ext4_msg(sb, KERN_CRIT,
2231 : "Delayed block allocation failed for "
2232 : "inode %lu at logical offset %llu with"
2233 : " max blocks %u with error %d",
2234 : inode->i_ino,
2235 : (unsigned long long)map->m_lblk,
2236 : (unsigned)map->m_len, -err);
2237 0 : ext4_msg(sb, KERN_CRIT,
2238 : "This should not happen!! Data will "
2239 : "be lost\n");
2240 0 : if (err == -ENOSPC)
2241 0 : ext4_print_free_blocks(inode);
2242 0 : invalidate_dirty_pages:
2243 0 : *give_up_on_write = true;
2244 0 : return err;
2245 : }
2246 1122825 : progress = 1;
2247 : /*
2248 : * Update buffer state, submit mapped pages, and get us new
2249 : * extent to map
2250 : */
2251 1122825 : err = mpage_map_and_submit_buffers(mpd);
2252 1122833 : if (err < 0)
2253 0 : goto update_disksize;
2254 1122833 : } while (map->m_len);
2255 :
2256 1122830 : update_disksize:
2257 : /*
2258 : * Update on-disk size after IO is submitted. Races with
2259 : * truncate are avoided by checking i_size under i_data_sem.
2260 : */
2261 1122830 : disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
2262 1122830 : if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
2263 496652 : int err2;
2264 496652 : loff_t i_size;
2265 :
2266 496652 : down_write(&EXT4_I(inode)->i_data_sem);
2267 496651 : i_size = i_size_read(inode);
2268 496651 : if (disksize > i_size)
2269 : disksize = i_size;
2270 496651 : if (disksize > EXT4_I(inode)->i_disksize)
2271 439056 : EXT4_I(inode)->i_disksize = disksize;
2272 496651 : up_write(&EXT4_I(inode)->i_data_sem);
2273 496650 : err2 = ext4_mark_inode_dirty(handle, inode);
2274 496652 : if (err2) {
2275 0 : ext4_error_err(inode->i_sb, -err2,
2276 : "Failed to mark inode %lu dirty",
2277 : inode->i_ino);
2278 : }
2279 496652 : if (!err)
2280 496652 : err = err2;
2281 : }
2282 : return err;
2283 : }
2284 :
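 : /*
 : * Error-path sketch for the mapping loop above: a transient failure
 : * (-ENOMEM, or -ENOSPC while free clusters remain) is returned to
 : * ext4_do_writepages(), which can force a journal commit and retry;
 : * only a hard failure sets *give_up_on_write so the caller discards
 : * the dirty pages instead of looping forever.
 : */
 :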
2285 : /*
2286 : * Calculate the total number of credits to reserve for one writepages
2287 : * iteration. This is called from ext4_writepages(). We map an extent of
2288 : * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2289 : * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2290 : * bpp - 1 blocks in bpp different extents.
2291 : */
2292 1162688 : static int ext4_da_writepages_trans_blocks(struct inode *inode)
2293 : {
2294 1162688 : int bpp = ext4_journal_blocks_per_page(inode);
2295 :
2296 1162685 : return ext4_meta_trans_blocks(inode,
2297 : MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2298 : }
2299 :
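 : /*
 : * A worked example, assuming 4K blocks and 4K pages so that
 : * ext4_journal_blocks_per_page() returns bpp == 1:
 : *
 : * ext4_meta_trans_blocks(inode, 2048 + 1 - 1, 1)
 : * == ext4_meta_trans_blocks(inode, 2048, 1)
 : *
 : * i.e. enough credits to map up to MAX_WRITEPAGES_EXTENT_LEN data
 : * blocks in one extent, plus the metadata that the allocation may
 : * dirty.
 : */
 :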
2300 0 : static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
2301 : size_t len)
2302 : {
2303 0 : struct buffer_head *page_bufs = folio_buffers(folio);
2304 0 : struct inode *inode = folio->mapping->host;
2305 0 : int ret, err;
2306 :
2307 0 : ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
2308 : NULL, do_journal_get_write_access);
2309 0 : err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
2310 : NULL, write_end_fn);
2311 0 : if (ret == 0)
2312 0 : ret = err;
2313 0 : err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
2314 0 : if (ret == 0)
2315 0 : ret = err;
2316 0 : EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
2317 :
2318 0 : return ret;
2319 : }
2320 :
2321 0 : static int mpage_journal_page_buffers(handle_t *handle,
2322 : struct mpage_da_data *mpd,
2323 : struct folio *folio)
2324 : {
2325 0 : struct inode *inode = mpd->inode;
2326 0 : loff_t size = i_size_read(inode);
2327 0 : size_t len = folio_size(folio);
2328 :
2329 0 : folio_clear_checked(folio);
2330 0 : mpd->wbc->nr_to_write--;
2331 :
2332 0 : if (folio_pos(folio) + len > size &&
2333 : !ext4_verity_in_progress(inode))
2334 0 : len = size - folio_pos(folio);
2335 :
2336 0 : return ext4_journal_folio_buffers(handle, folio, len);
2337 : }
2338 :
2339 : /*
2340 : * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2341 : * needing mapping, submit mapped pages
2342 : *
2343 : * @mpd - where to look for pages
2344 : *
2345 : * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2346 : * IO immediately. If we cannot map blocks, we submit just the already mapped
2347 : * buffers in the page for IO and keep the page dirty. When we can map blocks
2348 : * and we find a page which isn't mapped, we start accumulating an extent of
2349 : * buffers underlying these pages that needs mapping (formed by either delayed
2350 : * or unwritten buffers). We also lock the pages containing these buffers. The
2351 : * extent found is returned in the @mpd structure (starting at mpd->map.m_lblk
2352 : * with length mpd->map.m_len blocks).
2353 : *
2354 : * Note that this function can attach bios to one io_end structure which are
2355 : * neither logically nor physically contiguous. Although it may seem like an
2356 : * unnecessary complication, it is actually inevitable in the blocksize <
2357 : * pagesize case as we need to track IO to all buffers underlying a page in
2358 : * one io_end.
2358 : */
2359 2555125 : static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2360 : {
2361 2555125 : struct address_space *mapping = mpd->inode->i_mapping;
2362 2555125 : struct folio_batch fbatch;
2363 2555125 : unsigned int nr_folios;
2364 2555125 : pgoff_t index = mpd->first_page;
2365 2555125 : pgoff_t end = mpd->last_page;
2366 2555125 : xa_mark_t tag;
2367 2555125 : int i, err = 0;
2368 2555125 : int blkbits = mpd->inode->i_blkbits;
2369 2555125 : ext4_lblk_t lblk;
2370 2555125 : struct buffer_head *head;
2371 2555125 : handle_t *handle = NULL;
2372 2555125 : int bpp = ext4_journal_blocks_per_page(mpd->inode);
2373 :
2374 2555067 : if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2375 : tag = PAGECACHE_TAG_TOWRITE;
2376 : else
2377 600003 : tag = PAGECACHE_TAG_DIRTY;
2378 :
2379 2555067 : mpd->map.m_len = 0;
2380 2555067 : mpd->next_page = index;
2381 2555067 : if (ext4_should_journal_data(mpd->inode)) {
2382 0 : handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
2383 : bpp);
2384 0 : if (IS_ERR(handle))
2385 0 : return PTR_ERR(handle);
2386 : }
2387 2555090 : folio_batch_init(&fbatch);
2388 4640255 : while (index <= end) {
2389 4230189 : nr_folios = filemap_get_folios_tag(mapping, &index, end,
2390 : tag, &fbatch);
2391 4230180 : if (nr_folios == 0)
2392 : break;
2393 :
2394 33213139 : for (i = 0; i < nr_folios; i++) {
2395 31127991 : struct folio *folio = fbatch.folios[i];
2396 :
2397 : /*
2398 : * Accumulated enough dirty pages? This doesn't apply
2399 : * to WB_SYNC_ALL mode. For integrity sync we have to
2400 : * keep going because someone may be concurrently
2401 : * dirtying pages, and we might have synced a lot of
2402 : * newly appeared dirty pages, but have not synced all
2403 : * of the old dirty pages.
2404 : */
2405 31128547 : if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2406 18539877 : mpd->wbc->nr_to_write <=
2407 18539877 : mpd->map.m_len >> (PAGE_SHIFT - blkbits))
2408 144 : goto out;
2409 :
2410 : /* If we can't merge this page, we are done. */
2411 31128403 : if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
2412 263987 : goto out;
2413 :
2414 30864416 : if (handle) {
2415 0 : err = ext4_journal_ensure_credits(handle, bpp,
2416 : 0);
2417 0 : if (err < 0)
2418 0 : goto out;
2419 : }
2420 :
2421 30864416 : folio_lock(folio);
2422 : /*
2423 : * If the page is no longer dirty, or its mapping no
2424 : * longer corresponds to the inode we are writing (which
2425 : * means it has been truncated or invalidated), or the
2426 : * page is already under writeback and we are not doing
2427 : * a data integrity writeback, skip the page.
2428 : */
2429 30869020 : if (!folio_test_dirty(folio) ||
2430 12733 : (folio_test_writeback(folio) &&
2431 12733 : (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2432 30496261 : unlikely(folio->mapping != mapping)) {
2433 372759 : folio_unlock(folio);
2434 372780 : continue;
2435 : }
2436 :
2437 30496261 : folio_wait_writeback(folio);
2438 30496373 : BUG_ON(folio_test_writeback(folio));
2439 :
2440 : /*
2441 : * This should never happen, except for buggy code in
2442 : * other subsystems that call
2443 : * set_page_dirty() without properly warning
2444 : * the file system first. See [1] for more
2445 : * information.
2446 : *
2447 : * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
2448 : */
2449 30496373 : if (!folio_buffers(folio)) {
2450 0 : ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
2451 0 : folio_clear_dirty(folio);
2452 0 : folio_unlock(folio);
2453 0 : continue;
2454 : }
2455 :
2456 30496373 : if (mpd->map.m_len == 0)
2457 4829929 : mpd->first_page = folio->index;
2458 30496373 : mpd->next_page = folio->index + folio_nr_pages(folio);
2459 : /*
2460 : * Writeout when we cannot modify metadata is simple.
2461 : * Just submit the page. For data=journal mode we
2462 : * first handle writeout of the page for checkpoint and
2463 : * only after that handle delayed page dirtying. This
2464 : * makes sure current data is checkpointed to the final
2465 : * location before possibly journalling it again which
2466 : * is desirable when the page is frequently dirtied
2467 : * through a pin.
2468 : */
2469 30496373 : if (!mpd->can_map) {
2470 310840 : err = mpage_submit_folio(mpd, folio);
2471 310840 : if (err < 0)
2472 0 : goto out;
2473 : /* Pending dirtying of journalled data? */
2474 310840 : if (folio_test_checked(folio)) {
2475 0 : err = mpage_journal_page_buffers(handle,
2476 : mpd, folio);
2477 0 : if (err < 0)
2478 0 : goto out;
2479 0 : mpd->journalled_more_data = 1;
2480 : }
2481 310840 : mpage_folio_done(mpd, folio);
2482 : } else {
2483 : /* Add all dirty buffers to mpd */
2484 0 : lblk = ((ext4_lblk_t)folio->index) <<
2485 30185533 : (PAGE_SHIFT - blkbits);
2486 30185533 : head = folio_buffers(folio);
2487 30185533 : err = mpage_process_page_bufs(mpd, head, head,
2488 : lblk);
2489 30180276 : if (err <= 0)
2490 1329027 : goto out;
2491 : err = 0;
2492 : }
2493 : }
2494 2085148 : folio_batch_release(&fbatch);
2495 2085154 : cond_resched();
2496 : }
2497 961976 : mpd->scanned_until_end = 1;
2498 961976 : if (handle)
2499 0 : ext4_journal_stop(handle);
2500 : return 0;
2501 1593158 : out:
2502 1593158 : folio_batch_release(&fbatch);
2503 1593170 : if (handle)
2504 0 : ext4_journal_stop(handle);
2505 : return err;
2506 : }
2507 :
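 : /*
 : * Tag-selection sketch for the scan above: an integrity writeback
 : * (WB_SYNC_ALL or tagged_writepages) first has ext4_do_writepages()
 : * run tag_pages_for_writeback() to mark the range with
 : * PAGECACHE_TAG_TOWRITE, and mpage_prepare_extent_to_map() then scans
 : * only that tag, so pages dirtied after the sync started cannot make
 : * the scan chase its tail; a plain background scan just follows
 : * PAGECACHE_TAG_DIRTY.
 : */
 :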
2508 1630567 : static int ext4_do_writepages(struct mpage_da_data *mpd)
2509 : {
2510 1630567 : struct writeback_control *wbc = mpd->wbc;
2511 1630567 : pgoff_t writeback_index = 0;
2512 1630567 : long nr_to_write = wbc->nr_to_write;
2513 1630567 : int range_whole = 0;
2514 1630567 : int cycled = 1;
2515 1630567 : handle_t *handle = NULL;
2516 1630567 : struct inode *inode = mpd->inode;
2517 1630567 : struct address_space *mapping = inode->i_mapping;
2518 1630567 : int needed_blocks, rsv_blocks = 0, ret = 0;
2519 1630567 : struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2520 1630567 : struct blk_plug plug;
2521 1630567 : bool give_up_on_write = false;
2522 :
2523 1630567 : trace_ext4_writepages(inode, wbc);
2524 :
2525 : /*
2526 : * No pages to write? This is mainly a kludge to avoid starting
2527 : * a transaction for special inodes like the journal inode on last iput(),
2528 : * because that could violate lock ordering on umount.
2529 : */
2530 3169922 : if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2531 298094 : goto out_writepages;
2532 :
2533 : /*
2534 : * If the filesystem has aborted, it is read-only, so return
2535 : * right away instead of dumping stack traces later on that
2536 : * will obscure the real source of the problem. We test
2537 : * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
2538 : * the latter could be true if the filesystem is mounted
2539 : * read-only, and in that case, ext4_writepages should
2540 : * *never* be called, so if that ever happens, we would want
2541 : * the stack trace.
2542 : */
2543 2664886 : if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
2544 : ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
2545 0 : ret = -EROFS;
2546 0 : goto out_writepages;
2547 : }
2548 :
2549 : /*
2550 : * If we have inline data and arrive here, it means that
2551 : * we will soon create the block for the 1st page, so
2552 : * we'd better clear the inline data here.
2553 : */
2554 1332443 : if (ext4_has_inline_data(inode)) {
2555 : /* Just inode will be modified... */
2556 0 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2557 0 : if (IS_ERR(handle)) {
2558 0 : ret = PTR_ERR(handle);
2559 0 : goto out_writepages;
2560 : }
2561 0 : BUG_ON(ext4_test_inode_state(inode,
2562 : EXT4_STATE_MAY_INLINE_DATA));
2563 0 : ext4_destroy_inline_data(handle, inode);
2564 0 : ext4_journal_stop(handle);
2565 : }
2566 :
2567 : /*
2568 : * data=journal mode does not do delalloc so we just need to writeout /
2569 : * journal already mapped buffers. On the other hand we need to commit
2570 : * transaction to make data stable. We expect all the data to be
2571 : * already in the journal (the only exception are DMA pinned pages
2572 : * dirtied behind our back) so we commit transaction here and run the
2573 : * writeback loop to checkpoint them. The checkpointing is not actually
2574 : * necessary to make data persistent *but* quite a few places (extent
2575 : * shifting operations, fsverity, ...) depend on being able to drop
2576 : * pagecache pages after calling filemap_write_and_wait() and for that
2577 : * checkpointing needs to happen.
2578 : */
2579 1332443 : if (ext4_should_journal_data(inode)) {
2580 0 : mpd->can_map = 0;
2581 0 : if (wbc->sync_mode == WB_SYNC_ALL)
2582 0 : ext4_fc_commit(sbi->s_journal,
2583 0 : EXT4_I(inode)->i_datasync_tid);
2584 : }
2585 1332432 : mpd->journalled_more_data = 0;
2586 :
2587 1332432 : if (ext4_should_dioread_nolock(inode)) {
2588 : /*
2589 : * We may need to convert up to one extent per block in
2590 : * the page and we may dirty the inode.
2591 : */
2592 1332112 : rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
2593 1332106 : PAGE_SIZE >> inode->i_blkbits);
2594 : }
2595 :
2596 1332420 : if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2597 408712 : range_whole = 1;
2598 :
2599 1332420 : if (wbc->range_cyclic) {
2600 204017 : writeback_index = mapping->writeback_index;
2601 204017 : if (writeback_index)
2602 60117 : cycled = 0;
2603 204017 : mpd->first_page = writeback_index;
2604 204017 : mpd->last_page = -1;
2605 : } else {
2606 1128403 : mpd->first_page = wbc->range_start >> PAGE_SHIFT;
2607 1128403 : mpd->last_page = wbc->range_end >> PAGE_SHIFT;
2608 : }
2609 :
2610 1332420 : ext4_io_submit_init(&mpd->io_submit, wbc);
2611 1392388 : retry:
2612 1392388 : if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2613 1078706 : tag_pages_for_writeback(mapping, mpd->first_page,
2614 : mpd->last_page);
2615 1392470 : blk_start_plug(&plug);
2616 :
2617 : /*
2618 : * First write back pages that don't need mapping - we can avoid
2619 : * starting a transaction unnecessarily and also avoid being blocked
2620 : * in the block layer on device congestion while having a transaction
2621 : * started.
2622 : */
2623 1392468 : mpd->do_map = 0;
2624 1392468 : mpd->scanned_until_end = 0;
2625 1392468 : mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2626 1392478 : if (!mpd->io_submit.io_end) {
2627 0 : ret = -ENOMEM;
2628 0 : goto unplug;
2629 : }
2630 1392478 : ret = mpage_prepare_extent_to_map(mpd);
2631 : /* Unlock pages we didn't use */
2632 1392398 : mpage_release_unused_pages(mpd, false);
2633 : /* Submit prepared bio */
2634 1392394 : ext4_io_submit(&mpd->io_submit);
2635 1392384 : ext4_put_io_end_defer(mpd->io_submit.io_end);
2636 1392444 : mpd->io_submit.io_end = NULL;
2637 1392444 : if (ret < 0)
2638 0 : goto unplug;
2639 :
2640 2555123 : while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
2641 : /* For each extent of pages we use new io_end */
2642 1162704 : mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2643 1162674 : if (!mpd->io_submit.io_end) {
2644 : ret = -ENOMEM;
2645 : break;
2646 : }
2647 :
2648 1162674 : WARN_ON_ONCE(!mpd->can_map);
2649 : /*
2650 : * We have two constraints: We find one extent to map and we
2651 : * must always write out the whole page (makes a difference when
2652 : * blocksize < pagesize) so that we don't block on IO when we
2653 : * try to write out the rest of the page. Journalled mode is
2654 : * not supported by delalloc.
2655 : */
2656 1162674 : BUG_ON(ext4_should_journal_data(inode));
2657 1162691 : needed_blocks = ext4_da_writepages_trans_blocks(inode);
2658 :
2659 : /* start a new transaction */
2660 1162683 : handle = ext4_journal_start_with_reserve(inode,
2661 : EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2662 1162666 : if (IS_ERR(handle)) {
2663 0 : ret = PTR_ERR(handle);
2664 0 : ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2665 : "%ld pages, ino %lu; err %d", __func__,
2666 : wbc->nr_to_write, inode->i_ino, ret);
2667 : /* Release allocated io_end */
2668 0 : ext4_put_io_end(mpd->io_submit.io_end);
2669 0 : mpd->io_submit.io_end = NULL;
2670 0 : break;
2671 : }
2672 1162666 : mpd->do_map = 1;
2673 :
2674 1162666 : trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
2675 1162656 : ret = mpage_prepare_extent_to_map(mpd);
2676 1162694 : if (!ret && mpd->map.m_len)
2677 1122930 : ret = mpage_map_and_submit_extent(handle, mpd,
2678 : &give_up_on_write);
2679 : /*
2680 : * Caution: If the handle is synchronous,
2681 : * ext4_journal_stop() can wait for transaction commit
2682 : * to finish which may depend on writeback of pages to
2683 : * complete or on page lock to be released. In that
2684 : * case, we have to wait until after we have
2685 : * submitted all the IO, released page locks we hold,
2686 : * and dropped io_end reference (for extent conversion
2687 : * to be able to complete) before stopping the handle.
2688 : */
2689 1162688 : if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
2690 1162687 : ext4_journal_stop(handle);
2691 1162696 : handle = NULL;
2692 1162696 : mpd->do_map = 0;
2693 : }
2694 : /* Unlock pages we didn't use */
2695 1162697 : mpage_release_unused_pages(mpd, give_up_on_write);
2696 : /* Submit prepared bio */
2697 1162697 : ext4_io_submit(&mpd->io_submit);
2698 :
2699 : /*
2700 : * Drop the io_end reference we got from init. We have
2701 : * to be careful and use deferred io_end finishing if
2702 : * we are still holding the transaction as we can
2703 : * release the last reference to io_end which may end
2704 : * up doing unwritten extent conversion.
2705 : */
2706 1162668 : if (handle) {
2707 0 : ext4_put_io_end_defer(mpd->io_submit.io_end);
2708 0 : ext4_journal_stop(handle);
2709 : } else
2710 1162668 : ext4_put_io_end(mpd->io_submit.io_end);
2711 1162679 : mpd->io_submit.io_end = NULL;
2712 :
2713 1162679 : if (ret == -ENOSPC && sbi->s_journal) {
2714 : /*
2715 : * Commit the transaction which would
2716 : * free blocks released in the transaction
2717 : * and try again
2718 : */
2719 92 : jbd2_journal_force_commit_nested(sbi->s_journal);
2720 92 : ret = 0;
2721 92 : continue;
2722 : }
2723 : /* Fatal error - ENOMEM, EIO... */
2724 1162587 : if (ret)
2725 : break;
2726 : }
2727 1392419 : unplug:
2728 1392419 : blk_finish_plug(&plug);
2729 1392414 : if (!ret && !cycled && wbc->nr_to_write > 0) {
2730 59994 : cycled = 1;
2731 59994 : mpd->last_page = writeback_index - 1;
2732 59994 : mpd->first_page = 0;
2733 59994 : goto retry;
2734 : }
2735 :
2736 : /* Update index */
2737 1332420 : if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2738 : /*
2739 : * Set the writeback_index so that range_cyclic
2740 : * mode will write it back later
2741 : */
2742 408730 : mapping->writeback_index = mpd->first_page;
2743 :
2744 923690 : out_writepages:
2745 1630514 : trace_ext4_writepages_result(inode, wbc, ret,
2746 1630514 : nr_to_write - wbc->nr_to_write);
2747 1630450 : return ret;
2748 : }
2749 :
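 : /*
 : * A sketch of the range_cyclic retry in ext4_do_writepages() above,
 : * assuming mapping->writeback_index == 500 when the call starts (value
 : * invented):
 : *
 : * pass 1: first_page = 500, last_page = -1, cycled = 0
 : * pass 2 (goto retry): first_page = 0, last_page = 499, cycled = 1
 : *
 : * so a single call covers the whole mapping even when the scan starts
 : * in the middle of the file.
 : */
 :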
2750 1494552 : static int ext4_writepages(struct address_space *mapping,
2751 : struct writeback_control *wbc)
2752 : {
2753 1494552 : struct super_block *sb = mapping->host->i_sb;
2754 1494552 : struct mpage_da_data mpd = {
2755 : .inode = mapping->host,
2756 : .wbc = wbc,
2757 : .can_map = 1,
2758 : };
2759 1494552 : int ret;
2760 1494552 : int alloc_ctx;
2761 :
2762 2989104 : if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
2763 : return -EIO;
2764 :
2765 1470609 : alloc_ctx = ext4_writepages_down_read(sb);
2766 1470594 : ret = ext4_do_writepages(&mpd);
2767 : /*
2768 : * For data=journal writeback we could have come across pages marked
2769 : * for delayed dirtying (PageChecked) which were just added to the
2770 : * running transaction. Try once more to get them to stable storage.
2771 : */
2772 1470511 : if (!ret && mpd.journalled_more_data)
2773 0 : ret = ext4_do_writepages(&mpd);
2774 1470511 : ext4_writepages_up_read(sb, alloc_ctx);
2775 :
2776 1470511 : return ret;
2777 : }
2778 :
2779 159978 : int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
2780 : {
2781 159978 : struct writeback_control wbc = {
2782 : .sync_mode = WB_SYNC_ALL,
2783 : .nr_to_write = LONG_MAX,
2784 159978 : .range_start = jinode->i_dirty_start,
2785 159978 : .range_end = jinode->i_dirty_end,
2786 : };
2787 159978 : struct mpage_da_data mpd = {
2788 159978 : .inode = jinode->i_vfs_inode,
2789 : .wbc = &wbc,
2790 : .can_map = 0,
2791 : };
2792 159978 : return ext4_do_writepages(&mpd);
2793 : }
2794 :
2795 0 : static int ext4_dax_writepages(struct address_space *mapping,
2796 : struct writeback_control *wbc)
2797 : {
2798 0 : int ret;
2799 0 : long nr_to_write = wbc->nr_to_write;
2800 0 : struct inode *inode = mapping->host;
2801 0 : struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2802 0 : int alloc_ctx;
2803 :
2804 0 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
2805 : return -EIO;
2806 :
2807 0 : alloc_ctx = ext4_writepages_down_read(inode->i_sb);
2808 0 : trace_ext4_writepages(inode, wbc);
2809 :
2810 0 : ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
2811 0 : trace_ext4_writepages_result(inode, wbc, ret,
2812 0 : nr_to_write - wbc->nr_to_write);
2813 0 : ext4_writepages_up_read(inode->i_sb, alloc_ctx);
2814 0 : return ret;
2815 : }
2816 :
2817 120990794 : static int ext4_nonda_switch(struct super_block *sb)
2818 : {
2819 120990794 : s64 free_clusters, dirty_clusters;
2820 120990794 : struct ext4_sb_info *sbi = EXT4_SB(sb);
2821 :
2822 : /*
2823 : * Switch to non-delalloc mode if we are running low
2824 : * on free blocks. The free block accounting via percpu
2825 : * counters can get slightly wrong with percpu_counter_batch getting
2826 : * accumulated on each CPU without updating global counters.
2827 : * Delalloc needs accurate free block accounting, so switch
2828 : * to non-delalloc when we are near the error range.
2829 : */
2830 120990794 : free_clusters =
2831 : percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2832 120990794 : dirty_clusters =
2833 : percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2834 : /*
2835 : * Start pushing delalloc when 1/2 of free blocks are dirty.
2836 : */
2837 120990794 : if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
2838 167989 : try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2839 :
2840 120993460 : if (2 * free_clusters < 3 * dirty_clusters ||
2841 120905215 : free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
2842 : /*
2843 : * free block count is less than 150% of dirty blocks,
2844 : * or the free block count is below the watermark
2845 : */
2846 125297 : return 1;
2847 : }
2848 : return 0;
2849 : }
2850 :
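 : /*
 : * Worked numbers for the thresholds above (counts invented, ignoring
 : * the EXT4_FREECLUSTERS_WATERMARK term): with free_clusters = 100 and
 : * dirty_clusters = 60, writeback is kicked since 100 < 2 * 60, but we
 : * stay in delalloc mode since 2 * 100 >= 3 * 60. At free_clusters = 80,
 : * 2 * 80 < 3 * 60 holds and the function returns 1, falling back to
 : * non-delalloc writes.
 : */
 :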
2851 112958434 : static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2852 : loff_t pos, unsigned len,
2853 : struct page **pagep, void **fsdata)
2854 : {
2855 112958434 : int ret, retries = 0;
2856 112958434 : struct folio *folio;
2857 112958434 : pgoff_t index;
2858 112958434 : struct inode *inode = mapping->host;
2859 :
2860 225916868 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
2861 : return -EIO;
2862 :
2863 112958434 : index = pos >> PAGE_SHIFT;
2864 :
2865 112958434 : if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
2866 123778 : *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2867 123778 : return ext4_write_begin(file, mapping, pos,
2868 : len, pagep, fsdata);
2869 : }
2870 112888973 : *fsdata = (void *)0;
2871 112888973 : trace_ext4_da_write_begin(inode, pos, len);
2872 :
2873 112798959 : if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2874 0 : ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
2875 : pagep, fsdata);
2876 0 : if (ret < 0)
2877 : return ret;
2878 0 : if (ret == 1)
2879 : return 0;
2880 : }
2881 :
2882 112798959 : retry:
2883 112855377 : folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2884 : mapping_gfp_mask(mapping));
2885 112943592 : if (IS_ERR(folio))
2886 0 : return PTR_ERR(folio);
2887 :
2888 : /* In case writeback began while the folio was unlocked */
2889 112943592 : folio_wait_stable(folio);
2890 :
2891 : #ifdef CONFIG_FS_ENCRYPTION
2892 : ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
2893 : #else
2894 112937147 : ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
2895 : #endif
2896 112914015 : if (ret < 0) {
2897 111589 : folio_unlock(folio);
2898 111777 : folio_put(folio);
2899 : /*
2900 : * block_write_begin may have instantiated a few blocks
2901 : * outside i_size. Trim these off again. We don't need
2902 : * i_size_read because we hold the inode lock.
2903 : */
2904 111810 : if (pos + len > inode->i_size)
2905 75265 : ext4_truncate_failed_write(inode);
2906 :
2907 220733 : if (ret == -ENOSPC &&
2908 108921 : ext4_should_retry_alloc(inode->i_sb, &retries))
2909 56418 : goto retry;
2910 55394 : return ret;
2911 : }
2912 :
2913 112802426 : *pagep = &folio->page;
2914 112802426 : return ret;
2915 : }
2916 :
2917 : /*
2918 : * Check if we should update i_disksize
2919 : * when writing to the end of the file without requiring block allocation
2920 : */
2921 24040308 : static int ext4_da_should_update_i_disksize(struct folio *folio,
2922 : unsigned long offset)
2923 : {
2924 24040308 : struct buffer_head *bh;
2925 24040308 : struct inode *inode = folio->mapping->host;
2926 24040308 : unsigned int idx;
2927 24040308 : int i;
2928 :
2929 24040308 : bh = folio_buffers(folio);
2930 24040308 : idx = offset >> inode->i_blkbits;
2931 :
2932 24040319 : for (i = 0; i < idx; i++)
2933 11 : bh = bh->b_this_page;
2934 :
2935 72266200 : if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2936 24015871 : return 0;
2937 : return 1;
2938 : }
2939 :
2940 113015588 : static int ext4_da_write_end(struct file *file,
2941 : struct address_space *mapping,
2942 : loff_t pos, unsigned len, unsigned copied,
2943 : struct page *page, void *fsdata)
2944 : {
2945 113015588 : struct inode *inode = mapping->host;
2946 113015588 : loff_t new_i_size;
2947 113015588 : unsigned long start, end;
2948 113015588 : int write_mode = (int)(unsigned long)fsdata;
2949 113015588 : struct folio *folio = page_folio(page);
2950 :
2951 112918203 : if (write_mode == FALL_BACK_TO_NONDELALLOC)
2952 120808 : return ext4_write_end(file, mapping, pos,
2953 : len, copied, &folio->page, fsdata);
2954 :
2955 112797395 : trace_ext4_da_write_end(inode, pos, len, copied);
2956 :
2957 112800637 : if (write_mode != CONVERT_INLINE_DATA &&
2958 : ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
2959 : ext4_has_inline_data(inode))
2960 0 : return ext4_write_inline_data_end(inode, pos, len, copied,
2961 : folio);
2962 :
2963 112800637 : if (unlikely(copied < len) && !PageUptodate(page))
2964 0 : copied = 0;
2965 :
2966 112800637 : start = pos & (PAGE_SIZE - 1);
2967 112800637 : end = start + copied - 1;
2968 :
2969 : /*
2970 : * Since we are holding inode lock, we are sure i_disksize <=
2971 : * i_size. We also know that if i_disksize < i_size, there are
2972 : * delalloc writes pending in the range upto i_size. If the end of
2973 : * the current write is <= i_size, there's no need to touch
2974 : * i_disksize since writeback will push i_disksize upto i_size
2975 : * eventually. If the end of the current write is > i_size and
2976 : * inside an allocated block (ext4_da_should_update_i_disksize()
2977 : * check), we need to update i_disksize here as certain
2978 : * ext4_writepages() paths not allocating blocks update i_disksize.
2979 : *
2980 : * Note that we defer inode dirtying to generic_write_end() /
2981 : * ext4_da_write_inline_data_end().
2982 : */
2983 112800637 : new_i_size = pos + copied;
2984 136827550 : if (copied && new_i_size > inode->i_size &&
2985 24031282 : ext4_da_should_update_i_disksize(folio, end))
2986 24432 : ext4_update_i_disksize(inode, new_i_size);
2987 :
2988 112796298 : return generic_write_end(file, mapping, pos, len, copied, &folio->page,
2989 : fsdata);
2990 : }
2991 :
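 : /*
 : * An i_disksize example for ext4_da_write_end() above (offsets
 : * invented): with i_size == i_disksize == 100K, a 4K append at
 : * pos == 100K that lands in an already mapped, non-delay block gives
 : * new_i_size == 104K > i_size and ext4_da_should_update_i_disksize()
 : * returns 1, so i_disksize is pushed to 104K here; had the block been
 : * delalloc, writeback would push i_disksize later instead.
 : */
 :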
2992 : /*
2993 : * Force all delayed allocation blocks to be allocated for a given inode.
2994 : */
2995 78963 : int ext4_alloc_da_blocks(struct inode *inode)
2996 : {
2997 78963 : trace_ext4_alloc_da_blocks(inode);
2998 :
2999 78948 : if (!EXT4_I(inode)->i_reserved_data_blocks)
3000 : return 0;
3001 :
3002 : /*
3003 : * We do something simple for now. The filemap_flush() will
3004 : * also start triggering a write of the data blocks, which is
3005 : * not strictly speaking necessary (and for users of
3006 : * laptop_mode, not even desirable). However, to do otherwise
3007 : * would require replicating code paths in:
3008 : *
3009 : * ext4_writepages() ->
3010 : * write_cache_pages() ---> (via passed in callback function)
3011 : * __mpage_da_writepage() -->
3012 : * mpage_add_bh_to_extent()
3013 : * mpage_da_map_blocks()
3014 : *
3015 : * The problem is that write_cache_pages(), located in
3016 : * mm/page-writeback.c, marks pages clean in preparation for
3017 : * doing I/O, which is not desirable if we're not planning on
3018 : * doing I/O at all.
3019 : *
3020 : * We could call write_cache_pages(), and then redirty all of
3021 : * the pages by calling redirty_page_for_writepage() but that
3022 : * would be ugly in the extreme. So instead we would need to
3023 : * replicate parts of the code in the above functions,
3024 : * simplifying them because we wouldn't actually intend to
3025 : * write out the pages, but rather only collect contiguous
3026 : * logical block extents, call the multi-block allocator, and
3027 : * then update the buffer heads with the block allocations.
3028 : *
3029 : * For now, though, we'll cheat by calling filemap_flush(),
3030 : * which will map the blocks, and start the I/O, but not
3031 : * actually wait for the I/O to complete.
3032 : */
3033 46129 : return filemap_flush(inode->i_mapping);
3034 : }
3035 :
3036 : /*
3037 : * bmap() is special. It gets used by applications such as lilo and by
3038 : * the swapper to find the on-disk block of a specific piece of data.
3039 : *
3040 : * Naturally, this is dangerous if the block concerned is still in the
3041 : * journal. If somebody makes a swapfile on an ext4 data-journaling
3042 : * filesystem and enables swap, then they may get a nasty shock when the
3043 : * data getting swapped to that swapfile suddenly gets overwritten by
3044 : * the original zeros written out previously to the journal and
3045 : * awaiting writeback in the kernel's buffer cache.
3046 : *
3047 : * So, if we see any bmap calls here on a modified, data-journaled file,
3048 : * take extra steps to flush any blocks which might be in the cache.
3049 : */
3050 5053 : static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3051 : {
3052 5053 : struct inode *inode = mapping->host;
3053 5053 : sector_t ret = 0;
3054 :
3055 5053 : inode_lock_shared(inode);
3056 : /*
3057 : * We can get here for an inline file via the FIBMAP ioctl
3058 : */
3059 5053 : if (ext4_has_inline_data(inode))
3060 0 : goto out;
3061 :
3062 5053 : if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3063 2 : (test_opt(inode->i_sb, DELALLOC) ||
3064 : ext4_should_journal_data(inode))) {
3065 : /*
3066 : * With delalloc or journalled data we want to sync the file so
3067 : * that we can make sure blocks are allocated for the file and the
3068 : * data is in place for the user to see it.
3069 : */
3070 2 : filemap_write_and_wait(mapping);
3071 : }
3072 :
3073 5053 : ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
3074 :
3075 5053 : out:
3076 5053 : inode_unlock_shared(inode);
3077 5053 : return ret;
3078 : }
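
/*
 * Illustrative userspace sketch (not part of the original source): the
 * comment above mentions lilo and FIBMAP; this is roughly how such a
 * tool reaches ext4_bmap(). FIBMAP takes and returns block numbers in
 * filesystem-block units and typically requires CAP_SYS_RAWIO.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int logical_to_physical(const char *path, int logical)
 *	{
 *		int fd = open(path, O_RDONLY);
 *		int blk = logical;		// in: logical, out: physical
 *
 *		if (fd < 0)
 *			return -1;
 *		if (ioctl(fd, FIBMAP, &blk) < 0)	// calls ->bmap()
 *			blk = -1;
 *		close(fd);
 *		return blk;			// 0 means a hole
 *	}
 */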
3079 :
3080 63958 : static int ext4_read_folio(struct file *file, struct folio *folio)
3081 : {
3082 63958 : int ret = -EAGAIN;
3083 63958 : struct inode *inode = folio->mapping->host;
3084 :
3085 63958 : trace_ext4_read_folio(inode, folio);
3086 :
3087 63959 : if (ext4_has_inline_data(inode))
3088 0 : ret = ext4_readpage_inline(inode, folio);
3089 :
3090 63959 : if (ret == -EAGAIN)
3091 63959 : return ext4_mpage_readpages(inode, NULL, folio);
3092 :
3093 : return ret;
3094 : }
3095 :
3096 965542 : static void ext4_readahead(struct readahead_control *rac)
3097 : {
3098 965542 : struct inode *inode = rac->mapping->host;
3099 :
3100 : /* If the file has inline data, no need to do readahead. */
3101 965542 : if (ext4_has_inline_data(inode))
3102 : return;
3103 :
3104 965542 : ext4_mpage_readpages(inode, rac, NULL);
3105 : }
3106 :
3107 30342101 : static void ext4_invalidate_folio(struct folio *folio, size_t offset,
3108 : size_t length)
3109 : {
3110 30342101 : trace_ext4_invalidate_folio(folio, offset, length);
3111 :
3112 : /* No journalling happens on data buffers when this function is used */
3113 91020780 : WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
3114 :
3115 30340280 : block_invalidate_folio(folio, offset, length);
3116 30341144 : }
3117 :
3118 1052 : static int __ext4_journalled_invalidate_folio(struct folio *folio,
3119 : size_t offset, size_t length)
3120 : {
3121 1052 : journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
3122 :
3123 1052 : trace_ext4_journalled_invalidate_folio(folio, offset, length);
3124 :
3125 : /*
3126 : * If it's a full truncate we just forget about the pending dirtying
3127 : */
3128 1052 : if (offset == 0 && length == folio_size(folio))
3129 1048 : folio_clear_checked(folio);
3130 :
3131 1052 : return jbd2_journal_invalidate_folio(journal, folio, offset, length);
3132 : }
3133 :
3134 : /* Wrapper for aops... */
3135 1052 : static void ext4_journalled_invalidate_folio(struct folio *folio,
3136 : size_t offset,
3137 : size_t length)
3138 : {
3139 1052 : WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
3140 1052 : }
3141 :
3142 33602415 : static bool ext4_release_folio(struct folio *folio, gfp_t wait)
3143 : {
3144 33602415 : struct inode *inode = folio->mapping->host;
3145 33602415 : journal_t *journal = EXT4_JOURNAL(inode);
3146 :
3147 33602415 : trace_ext4_release_folio(inode, folio);
3148 :
3149 : /* Page has dirty journalled data -> cannot release */
3150 33601819 : if (folio_test_checked(folio))
3151 : return false;
3152 33601819 : if (journal)
3153 33601798 : return jbd2_journal_try_to_free_buffers(journal, folio);
3154 : else
3155 21 : return try_to_free_buffers(folio);
3156 : }
3157 :
3158 2541070 : static bool ext4_inode_datasync_dirty(struct inode *inode)
3159 : {
3160 2541070 : journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
3161 :
3162 2541070 : if (journal) {
3163 2538887 : if (jbd2_transaction_committed(journal,
3164 2538579 : EXT4_I(inode)->i_datasync_tid))
3165 : return false;
3166 2086899 : if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
3167 0 : return !list_empty(&EXT4_I(inode)->i_fc_list);
3168 : return true;
3169 : }
3170 :
3171 : /* Any metadata buffers to write? */
3172 2491 : if (!list_empty(&inode->i_mapping->private_list))
3173 : return true;
3174 2491 : return inode->i_state & I_DIRTY_DATASYNC;
3175 : }
3176 :
3177 2541199 : static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
3178 : struct ext4_map_blocks *map, loff_t offset,
3179 : loff_t length, unsigned int flags)
3180 : {
3181 2541199 : u8 blkbits = inode->i_blkbits;
3182 :
3183 : /*
3184 : * Writes that span EOF might trigger an I/O size update on completion,
3185 : * so consider them to be dirty for the purpose of O_DSYNC, even if
3186 : * no other metadata changes are being made or pending.
3187 : */
3188 2541199 : iomap->flags = 0;
3189 2541199 : if (ext4_inode_datasync_dirty(inode) ||
3190 454474 : offset + length > i_size_read(inode))
3191 2139082 : iomap->flags |= IOMAP_F_DIRTY;
3192 :
3193 2541296 : if (map->m_flags & EXT4_MAP_NEW)
3194 777909 : iomap->flags |= IOMAP_F_NEW;
3195 :
3196 2541296 : if (flags & IOMAP_DAX)
3197 0 : iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
3198 : else
3199 2541296 : iomap->bdev = inode->i_sb->s_bdev;
3200 2541296 : iomap->offset = (u64) map->m_lblk << blkbits;
3201 2541296 : iomap->length = (u64) map->m_len << blkbits;
3202 :
3203 2541296 : if ((map->m_flags & EXT4_MAP_MAPPED) &&
3204 : !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3205 589 : iomap->flags |= IOMAP_F_MERGED;
3206 :
3207 : /*
3208 : * Flags passed to ext4_map_blocks() for direct I/O writes can result
3209 : * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
3210 : * set. In order for any allocated unwritten extents to be converted
3211 : * into written extents correctly within the ->end_io() handler, we
3212 : * need to ensure that the iomap->type is set appropriately. Hence, the
3213 : * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
3214 : * been set first.
3215 : */
3216 2541296 : if (map->m_flags & EXT4_MAP_UNWRITTEN) {
3217 715550 : iomap->type = IOMAP_UNWRITTEN;
3218 715550 : iomap->addr = (u64) map->m_pblk << blkbits;
3219 715550 : if (flags & IOMAP_DAX)
3220 0 : iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
3221 1825746 : } else if (map->m_flags & EXT4_MAP_MAPPED) {
3222 1348806 : iomap->type = IOMAP_MAPPED;
3223 1348806 : iomap->addr = (u64) map->m_pblk << blkbits;
3224 1348806 : if (flags & IOMAP_DAX)
3225 0 : iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
3226 : } else {
3227 476940 : iomap->type = IOMAP_HOLE;
3228 476940 : iomap->addr = IOMAP_NULL_ADDR;
3229 : }
3230 2541296 : }
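
/*
 * Summary of the translation above (illustrative):
 *
 *	EXT4_MAP_UNWRITTEN set             -> IOMAP_UNWRITTEN, addr = m_pblk
 *	EXT4_MAP_MAPPED set (no UNWRITTEN) -> IOMAP_MAPPED,    addr = m_pblk
 *	neither bit set                    -> IOMAP_HOLE,      addr = IOMAP_NULL_ADDR
 *
 * EXT4_MAP_UNWRITTEN is tested first because a direct I/O write can set
 * both bits, and ->end_io() must see IOMAP_UNWRITTEN in order to convert
 * the allocated extent to written on completion.
 */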
3231 :
3232 1441341 : static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
3233 : unsigned int flags)
3234 : {
3235 1441341 : handle_t *handle;
3236 1441341 : u8 blkbits = inode->i_blkbits;
3237 1441341 : int ret, dio_credits, m_flags = 0, retries = 0;
3238 :
3239 : /*
3240 : * Trim the mapping request to the maximum value that we can map at
3241 : * once for direct I/O.
3242 : */
3243 1441341 : if (map->m_len > DIO_MAX_BLOCKS)
3244 31 : map->m_len = DIO_MAX_BLOCKS;
3245 1441341 : dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
3246 :
3247 1463154 : retry:
3248 : /*
3249 : * Either we allocate blocks and then don't get an unwritten extent, in
3250 : * which case we have reserved enough credits; or the blocks are
3251 : * already allocated and unwritten, in which case the extent conversion
3252 : * fits into the credits as well.
3253 : */
3254 1463154 : handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
3255 1462358 : if (IS_ERR(handle))
3256 0 : return PTR_ERR(handle);
3257 :
3258 : /*
3259 : * DAX and direct I/O are the only two operations that are currently
3260 : * supported with IOMAP_WRITE.
3261 : */
3262 1462358 : WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
3263 1462358 : if (flags & IOMAP_DAX)
3264 : m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
3265 : /*
3266 : * We use i_size instead of i_disksize here because delalloc writeback
3267 : * can complete at any point during the I/O and subsequently push the
3268 : * i_disksize out to i_size. This could be beyond where direct I/O is
3269 : * happening and thus expose allocated blocks to direct I/O reads.
3270 : */
3271 1462373 : else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
3272 : m_flags = EXT4_GET_BLOCKS_CREATE;
3273 1181507 : else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3274 1181394 : m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3275 :
3276 1462358 : ret = ext4_map_blocks(handle, inode, map, m_flags);
3277 :
3278 : /*
3279 : * We cannot fill holes in indirect-tree-based inodes as that could
3280 : * expose stale data in the case of a crash. Use the magic error code
3281 : * to fall back to buffered I/O.
3282 : */
3283 1464619 : if (!m_flags && !ret)
3284 40 : ret = -ENOTBLK;
3285 :
3286 1464619 : ext4_journal_stop(handle);
3287 1464602 : if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3288 22349 : goto retry;
3289 :
3290 : return ret;
3291 : }
3292 :
3294 2614718 : static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3295 : unsigned flags, struct iomap *iomap, struct iomap *srcmap)
3296 : {
3297 2614718 : int ret;
3298 2614718 : struct ext4_map_blocks map;
3299 2614718 : u8 blkbits = inode->i_blkbits;
3300 :
3301 2614718 : if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3302 : return -EINVAL;
3303 :
3304 2614718 : if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
3305 : return -ERANGE;
3306 :
3307 : /*
3308 : * Calculate the first and last logical blocks respectively.
3309 : */
3310 2614718 : map.m_lblk = offset >> blkbits;
3311 2614718 : map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
3312 2614718 : EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
3313 :
3314 2614718 : if (flags & IOMAP_WRITE) {
3315 : /*
3316 : * We check here whether the blocks are already allocated; if so, we
3317 : * don't need to start a journal transaction and can directly return
3318 : * the mapping information. This can boost performance,
3319 : * especially for multi-threaded overwrite requests.
3320 : */
3321 1557951 : if (offset + length <= i_size_read(inode)) {
3322 1257211 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3323 1260203 : if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
3324 119317 : goto out;
3325 : }
3326 1441626 : ret = ext4_iomap_alloc(inode, &map, flags);
3327 : } else {
3328 1056767 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3329 : }
3330 :
3331 2499247 : if (ret < 0)
3332 : return ret;
3333 1859069 : out:
3334 : /*
3335 : * When inline encryption is enabled, sometimes I/O to an encrypted file
3336 : * has to be broken up to guarantee DUN contiguity. Handle this by
3337 : * limiting the length of the mapping returned.
3338 : */
3339 1978386 : map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
3340 :
3341 1978386 : ext4_set_iomap(inode, iomap, &map, offset, length, flags);
3342 :
3343 1978386 : return 0;
3344 : }
3345 :
3346 290205 : static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
3347 : loff_t length, unsigned flags, struct iomap *iomap,
3348 : struct iomap *srcmap)
3349 : {
3350 290205 : int ret;
3351 :
3352 : /*
3353 : * Even for writes we don't need to allocate blocks, so just pretend
3354 : * we are reading to save the overhead of starting a transaction.
3355 : */
3356 290205 : flags &= ~IOMAP_WRITE;
3357 290205 : ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
3358 580471 : WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
3359 290236 : return ret;
3360 : }
3361 :
3362 1977668 : static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
3363 : ssize_t written, unsigned flags, struct iomap *iomap)
3364 : {
3365 : /*
3366 : * Check to see whether an error occurred while writing out the data to
3367 : * the allocated blocks. If so, return the magic error code so that we
3368 : * fallback to buffered I/O and attempt to complete the remainder of
3369 : * the I/O. Any blocks that may have been allocated in preparation for
3370 : * the direct I/O will be reused during buffered I/O.
3371 : */
3372 1977668 : if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
3373 88 : return -ENOTBLK;
3374 :
3375 : return 0;
3376 : }
3377 :
3378 : const struct iomap_ops ext4_iomap_ops = {
3379 : .iomap_begin = ext4_iomap_begin,
3380 : .iomap_end = ext4_iomap_end,
3381 : };
3382 :
3383 : const struct iomap_ops ext4_iomap_overwrite_ops = {
3384 : .iomap_begin = ext4_iomap_overwrite_begin,
3385 : .iomap_end = ext4_iomap_end,
3386 : };
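
/*
 * Illustrative sketch (assumed caller, not in this file): these ops
 * tables are consumed by the generic iomap direct I/O code from the
 * ext4 file read/write paths, roughly as below. The exact
 * iomap_dio_rw() signature varies across kernel versions.
 *
 *	static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		// ext4_iomap_begin()/ext4_iomap_end() provide the mapping
 *		return iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
 *				    0, NULL, 0);
 *	}
 */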
3387 :
3388 250542 : static bool ext4_iomap_is_delalloc(struct inode *inode,
3389 : struct ext4_map_blocks *map)
3390 : {
3391 250542 : struct extent_status es;
3392 250542 : ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
3393 :
3394 250542 : ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
3395 : map->m_lblk, end, &es);
3396 :
3397 250542 : if (!es.es_len || es.es_lblk > end)
3398 : return false;
3399 :
3400 432 : if (es.es_lblk > map->m_lblk) {
3401 6 : map->m_len = es.es_lblk - map->m_lblk;
3402 6 : return false;
3403 : }
3404 :
3405 426 : offset = map->m_lblk - es.es_lblk;
3406 426 : map->m_len = es.es_len - offset;
3407 :
3408 426 : return true;
3409 : }
3410 :
3411 562928 : static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
3412 : loff_t length, unsigned int flags,
3413 : struct iomap *iomap, struct iomap *srcmap)
3414 : {
3415 562928 : int ret;
3416 562928 : bool delalloc = false;
3417 562928 : struct ext4_map_blocks map;
3418 562928 : u8 blkbits = inode->i_blkbits;
3419 :
3420 562928 : if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3421 : return -EINVAL;
3422 :
3423 562928 : if (ext4_has_inline_data(inode)) {
3424 0 : ret = ext4_inline_data_iomap(inode, iomap);
3425 0 : if (ret != -EAGAIN) {
3426 0 : if (ret == 0 && offset >= iomap->length)
3427 0 : ret = -ENOENT;
3428 0 : return ret;
3429 : }
3430 : }
3431 :
3432 : /*
3433 : * Calculate the first and last logical block respectively.
3434 : */
3435 562928 : map.m_lblk = offset >> blkbits;
3436 562928 : map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
3437 562928 : EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
3438 :
3439 : /*
3440 : * Fiemap callers may call for an offset beyond s_bitmap_maxbytes.
3441 : * Handle that case here instead of querying ext4_map_blocks(),
3442 : * since ext4_map_blocks() would warn about it and return an
3443 : * -EIO error.
3444 : */
3445 562928 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
3446 5065 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3447 :
3448 5065 : if (offset >= sbi->s_bitmap_maxbytes) {
3449 0 : map.m_flags = 0;
3450 0 : goto set_iomap;
3451 : }
3452 : }
3453 :
3454 562928 : ret = ext4_map_blocks(NULL, inode, &map, 0);
3455 562928 : if (ret < 0)
3456 : return ret;
3457 562928 : if (ret == 0)
3458 250542 : delalloc = ext4_iomap_is_delalloc(inode, &map);
3459 :
3460 312386 : set_iomap:
3461 562928 : ext4_set_iomap(inode, iomap, &map, offset, length, flags);
3462 562928 : if (delalloc && iomap->type == IOMAP_HOLE)
3463 426 : iomap->type = IOMAP_DELALLOC;
3464 :
3465 : return 0;
3466 : }
3467 :
3468 : const struct iomap_ops ext4_iomap_report_ops = {
3469 : .iomap_begin = ext4_iomap_begin_report,
3470 : };
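
/*
 * Illustrative note (assumed callers, not in this file): the report ops
 * serve read-only queries such as FIEMAP and SEEK_HOLE/SEEK_DATA, e.g.:
 *
 *	iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
 *	iomap_seek_hole(inode, offset, &ext4_iomap_report_ops);
 *
 * Only ->iomap_begin is needed since nothing is allocated or written,
 * and delalloc extents are reported as IOMAP_DELALLOC rather than holes.
 */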
3471 :
3472 : /*
3473 : * For data=journal mode, folio should be marked dirty only when it was
3474 : * writeably mapped. When that happens, it was already attached to the
3475 : * transaction and marked as jbddirty (we take care of this in
3476 : * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
3477 : * so we should have nothing to do here, except for the case when someone
3478 : * had the page pinned and dirtied the page through this pin (e.g. by doing
3479 : * direct IO to it). In that case we'd need to attach buffers here to the
3480 : * transaction but we cannot due to lock ordering. We cannot just dirty the
3481 : * folio and leave attached buffers clean, because the buffers' dirty state is
3482 : * "definitive". We cannot just set the buffers dirty or jbddirty because all
3483 : * the journalling code will explode. So what we do is to mark the folio
3484 : * "pending dirty" and next time ext4_writepages() is called, attach buffers
3485 : * to the transaction appropriately.
3486 : */
3487 0 : static bool ext4_journalled_dirty_folio(struct address_space *mapping,
3488 : struct folio *folio)
3489 : {
3490 0 : WARN_ON_ONCE(!folio_buffers(folio));
3491 0 : if (folio_maybe_dma_pinned(folio))
3492 0 : folio_set_checked(folio);
3493 0 : return filemap_dirty_folio(mapping, folio);
3494 : }
3495 :
3496 24014854 : static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
3497 : {
3498 29260940 : WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
3499 24014854 : WARN_ON_ONCE(!folio_buffers(folio));
3500 24014854 : return block_dirty_folio(mapping, folio);
3501 : }
3502 :
3503 30 : static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
3504 : struct file *file, sector_t *span)
3505 : {
3506 30 : return iomap_swapfile_activate(sis, file, span,
3507 : &ext4_iomap_report_ops);
3508 : }
3509 :
3510 : static const struct address_space_operations ext4_aops = {
3511 : .read_folio = ext4_read_folio,
3512 : .readahead = ext4_readahead,
3513 : .writepages = ext4_writepages,
3514 : .write_begin = ext4_write_begin,
3515 : .write_end = ext4_write_end,
3516 : .dirty_folio = ext4_dirty_folio,
3517 : .bmap = ext4_bmap,
3518 : .invalidate_folio = ext4_invalidate_folio,
3519 : .release_folio = ext4_release_folio,
3520 : .direct_IO = noop_direct_IO,
3521 : .migrate_folio = buffer_migrate_folio,
3522 : .is_partially_uptodate = block_is_partially_uptodate,
3523 : .error_remove_page = generic_error_remove_page,
3524 : .swap_activate = ext4_iomap_swap_activate,
3525 : };
3526 :
3527 : static const struct address_space_operations ext4_journalled_aops = {
3528 : .read_folio = ext4_read_folio,
3529 : .readahead = ext4_readahead,
3530 : .writepages = ext4_writepages,
3531 : .write_begin = ext4_write_begin,
3532 : .write_end = ext4_journalled_write_end,
3533 : .dirty_folio = ext4_journalled_dirty_folio,
3534 : .bmap = ext4_bmap,
3535 : .invalidate_folio = ext4_journalled_invalidate_folio,
3536 : .release_folio = ext4_release_folio,
3537 : .direct_IO = noop_direct_IO,
3538 : .migrate_folio = buffer_migrate_folio_norefs,
3539 : .is_partially_uptodate = block_is_partially_uptodate,
3540 : .error_remove_page = generic_error_remove_page,
3541 : .swap_activate = ext4_iomap_swap_activate,
3542 : };
3543 :
3544 : static const struct address_space_operations ext4_da_aops = {
3545 : .read_folio = ext4_read_folio,
3546 : .readahead = ext4_readahead,
3547 : .writepages = ext4_writepages,
3548 : .write_begin = ext4_da_write_begin,
3549 : .write_end = ext4_da_write_end,
3550 : .dirty_folio = ext4_dirty_folio,
3551 : .bmap = ext4_bmap,
3552 : .invalidate_folio = ext4_invalidate_folio,
3553 : .release_folio = ext4_release_folio,
3554 : .direct_IO = noop_direct_IO,
3555 : .migrate_folio = buffer_migrate_folio,
3556 : .is_partially_uptodate = block_is_partially_uptodate,
3557 : .error_remove_page = generic_error_remove_page,
3558 : .swap_activate = ext4_iomap_swap_activate,
3559 : };
3560 :
3561 : static const struct address_space_operations ext4_dax_aops = {
3562 : .writepages = ext4_dax_writepages,
3563 : .direct_IO = noop_direct_IO,
3564 : .dirty_folio = noop_dirty_folio,
3565 : .bmap = ext4_bmap,
3566 : .swap_activate = ext4_iomap_swap_activate,
3567 : };
3568 :
3569 2270190 : void ext4_set_aops(struct inode *inode)
3570 : {
3571 2270190 : switch (ext4_inode_journal_mode(inode)) {
3572 : case EXT4_INODE_ORDERED_DATA_MODE:
3573 : case EXT4_INODE_WRITEBACK_DATA_MODE:
3574 2263017 : break;
3575 5745 : case EXT4_INODE_JOURNAL_DATA_MODE:
3576 5745 : inode->i_mapping->a_ops = &ext4_journalled_aops;
3577 5745 : return;
3578 0 : default:
3579 0 : BUG();
3580 : }
3581 2263017 : if (IS_DAX(inode))
3582 0 : inode->i_mapping->a_ops = &ext4_dax_aops;
3583 2263017 : else if (test_opt(inode->i_sb, DELALLOC))
3584 2262737 : inode->i_mapping->a_ops = &ext4_da_aops;
3585 : else
3586 280 : inode->i_mapping->a_ops = &ext4_aops;
3587 : }
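
/*
 * Selection summary for ext4_set_aops() (illustrative):
 *
 *	data=journal inode            -> ext4_journalled_aops
 *	DAX inode                     -> ext4_dax_aops
 *	delalloc enabled (default)    -> ext4_da_aops
 *	nodelalloc                    -> ext4_aops
 *
 * The DAX check is made before the delalloc one, so a DAX inode never
 * uses the delayed-allocation write paths.
 */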
3588 :
3589 899320 : static int __ext4_block_zero_page_range(handle_t *handle,
3590 : struct address_space *mapping, loff_t from, loff_t length)
3591 : {
3592 899320 : ext4_fsblk_t index = from >> PAGE_SHIFT;
3593 899320 : unsigned offset = from & (PAGE_SIZE-1);
3594 899320 : unsigned blocksize, pos;
3595 899320 : ext4_lblk_t iblock;
3596 899320 : struct inode *inode = mapping->host;
3597 899320 : struct buffer_head *bh;
3598 899320 : struct folio *folio;
3599 899320 : int err = 0;
3600 :
3601 899320 : folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
3602 : FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
3603 : mapping_gfp_constraint(mapping, ~__GFP_FS));
3604 899328 : if (IS_ERR(folio))
3605 0 : return PTR_ERR(folio);
3606 :
3607 899328 : blocksize = inode->i_sb->s_blocksize;
3608 :
3609 899328 : iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
3610 :
3611 899328 : bh = folio_buffers(folio);
3612 899328 : if (!bh) {
3613 731487 : create_empty_buffers(&folio->page, blocksize, 0);
3614 731488 : bh = folio_buffers(folio);
3615 : }
3616 :
3617 : /* Find the buffer that contains "offset" */
3618 899329 : pos = blocksize;
3619 899329 : while (offset >= pos) {
3620 0 : bh = bh->b_this_page;
3621 0 : iblock++;
3622 0 : pos += blocksize;
3623 : }
3624 1798658 : if (buffer_freed(bh)) {
3625 0 : BUFFER_TRACE(bh, "freed: skip");
3626 0 : goto unlock;
3627 : }
3628 1798658 : if (!buffer_mapped(bh)) {
3629 758807 : BUFFER_TRACE(bh, "unmapped");
3630 758807 : ext4_get_block(inode, iblock, bh, 0);
3631 : /* unmapped? It's a hole - nothing to do */
3632 1517608 : if (!buffer_mapped(bh)) {
3633 629882 : BUFFER_TRACE(bh, "still unmapped");
3634 629882 : goto unlock;
3635 : }
3636 : }
3637 :
3638 : /* Ok, it's mapped. Make sure it's up-to-date */
3639 466965 : if (folio_test_uptodate(folio))
3640 197519 : set_buffer_uptodate(bh);
3641 :
3642 538889 : if (!buffer_uptodate(bh)) {
3643 69778 : err = ext4_read_bh_lock(bh, 0, true);
3644 69778 : if (err)
3645 0 : goto unlock;
3646 : if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
3647 : /* We expect the key to be set. */
3648 : BUG_ON(!fscrypt_has_encryption_key(inode));
3649 : err = fscrypt_decrypt_pagecache_blocks(folio,
3650 : blocksize,
3651 : bh_offset(bh));
3652 : if (err) {
3653 : clear_buffer_uptodate(bh);
3654 : goto unlock;
3655 : }
3656 : }
3657 : }
3658 269445 : if (ext4_should_journal_data(inode)) {
3659 0 : BUFFER_TRACE(bh, "get write access");
3660 0 : err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
3661 : EXT4_JTR_NONE);
3662 0 : if (err)
3663 0 : goto unlock;
3664 : }
3665 269442 : folio_zero_range(folio, offset, length);
3666 269445 : BUFFER_TRACE(bh, "zeroed end of block");
3667 :
3668 269445 : if (ext4_should_journal_data(inode)) {
3669 0 : err = ext4_dirty_journalled_data(handle, bh);
3670 : } else {
3671 269445 : err = 0;
3672 269445 : mark_buffer_dirty(bh);
3673 269447 : if (ext4_should_order_data(inode))
3674 269446 : err = ext4_jbd2_inode_add_write(handle, inode, from,
3675 : length);
3676 : }
3677 :
3678 0 : unlock:
3679 899330 : folio_unlock(folio);
3680 899342 : folio_put(folio);
3681 899342 : return err;
3682 : }
3683 :
3684 : /*
3685 : * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3686 : * starting from file offset 'from'. The range to be zeroed must
3687 : * be contained within one block. If the specified range exceeds
3688 : * the end of the block, it will be shortened to the end of the
3689 : * block that corresponds to 'from'.
3690 : */
3691 899324 : static int ext4_block_zero_page_range(handle_t *handle,
3692 : struct address_space *mapping, loff_t from, loff_t length)
3693 : {
3694 899324 : struct inode *inode = mapping->host;
3695 899324 : unsigned offset = from & (PAGE_SIZE-1);
3696 899324 : unsigned blocksize = inode->i_sb->s_blocksize;
3697 899324 : unsigned max = blocksize - (offset & (blocksize - 1));
3698 :
3699 : /*
3700 : * correct length if it does not fall between
3701 : * 'from' and the end of the block
3702 : */
3703 899324 : if (length > max || length < 0)
3704 384008 : length = max;
3705 :
3706 899324 : if (IS_DAX(inode)) {
3707 0 : return dax_zero_range(inode, from, length, NULL,
3708 : &ext4_iomap_ops);
3709 : }
3710 899324 : return __ext4_block_zero_page_range(handle, mapping, from, length);
3711 : }
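
/*
 * Worked example for the clamping above (illustrative, 4096-byte
 * blocks): a request with from=5000 and length=8000 has an in-block
 * offset of 5000 % 4096 = 904, so max = 4096 - 904 = 3192 and the
 * length is shortened to 3192, stopping exactly at the end of the
 * block that contains 'from'.
 */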
3712 :
3713 : /*
3714 : * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3715 : * up to the end of the block which corresponds to `from'.
3716 : * This is required during truncate. We need to physically zero the tail end
3717 : * of that block so it doesn't yield old data if the file is later grown.
3718 : */
3719 134638 : static int ext4_block_truncate_page(handle_t *handle,
3720 : struct address_space *mapping, loff_t from)
3721 : {
3722 134638 : unsigned offset = from & (PAGE_SIZE-1);
3723 134638 : unsigned length;
3724 134638 : unsigned blocksize;
3725 134638 : struct inode *inode = mapping->host;
3726 :
3727 : /* If we are processing an encrypted inode during orphan list handling */
3728 134638 : if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
3729 : return 0;
3730 :
3731 134637 : blocksize = inode->i_sb->s_blocksize;
3732 134637 : length = blocksize - (offset & (blocksize - 1));
3733 :
3734 134637 : return ext4_block_zero_page_range(handle, mapping, from, length);
3735 : }
3736 :
3737 451983 : int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3738 : loff_t lstart, loff_t length)
3739 : {
3740 451983 : struct super_block *sb = inode->i_sb;
3741 451983 : struct address_space *mapping = inode->i_mapping;
3742 451983 : unsigned partial_start, partial_end;
3743 451983 : ext4_fsblk_t start, end;
3744 451983 : loff_t byte_end = (lstart + length - 1);
3745 451983 : int err = 0;
3746 :
3747 451983 : partial_start = lstart & (sb->s_blocksize - 1);
3748 451983 : partial_end = byte_end & (sb->s_blocksize - 1);
3749 :
3750 451983 : start = lstart >> sb->s_blocksize_bits;
3751 451983 : end = byte_end >> sb->s_blocksize_bits;
3752 :
3753 : /* Handle partial zero within the single block */
3754 451983 : if (start == end &&
3755 53250 : (partial_start || (partial_end != sb->s_blocksize - 1))) {
3756 11394 : err = ext4_block_zero_page_range(handle, mapping,
3757 : lstart, length);
3758 11394 : return err;
3759 : }
3760 : /* Handle partial zero out on the start of the range */
3761 440589 : if (partial_start) {
3762 384009 : err = ext4_block_zero_page_range(handle, mapping,
3763 384009 : lstart, sb->s_blocksize);
3764 384011 : if (err)
3765 : return err;
3766 : }
3767 : /* Handle partial zero out on the end of the range */
3768 440591 : if (partial_end != sb->s_blocksize - 1)
3769 369289 : err = ext4_block_zero_page_range(handle, mapping,
3770 : byte_end - partial_end,
3771 369289 : partial_end + 1);
3772 : return err;
3773 : }
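
/*
 * Worked example for the splitting above (illustrative, 4096-byte
 * blocks): lstart=1000, length=10000 gives byte_end=10999,
 * partial_start=1000, partial_end=10999 % 4096 = 2807, start=0, end=2.
 * Since start != end, bytes 1000..4095 of block 0 and bytes 0..2807 of
 * block 2 are zeroed here; block 1 is fully covered by the range and
 * is freed by the caller instead of being zeroed.
 */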
3774 :
3775 598317 : int ext4_can_truncate(struct inode *inode)
3776 : {
3777 598317 : if (S_ISREG(inode->i_mode))
3778 : return 1;
3779 193516 : if (S_ISDIR(inode->i_mode))
3780 : return 1;
3781 7864 : if (S_ISLNK(inode->i_mode))
3782 7864 : return !ext4_inode_is_fast_symlink(inode);
3783 : return 0;
3784 : }
3785 :
3786 : /*
3787 : * We have to make sure i_disksize gets properly updated before we truncate
3788 : * page cache due to hole punching or zero range. Otherwise the i_disksize
3789 : * update can get lost: it may have been postponed until writeback
3790 : * submission, which will never happen once we truncate the page cache.
3791 : */
3792 424698 : int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
3793 : loff_t len)
3794 : {
3795 424698 : handle_t *handle;
3796 424698 : int ret;
3797 :
3798 424698 : loff_t size = i_size_read(inode);
3799 :
3800 424698 : WARN_ON(!inode_is_locked(inode));
3801 424698 : if (offset > size || offset + len < size)
3802 : return 0;
3803 :
3804 74776 : if (EXT4_I(inode)->i_disksize >= size)
3805 : return 0;
3806 :
3807 926 : handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
3808 926 : if (IS_ERR(handle))
3809 0 : return PTR_ERR(handle);
3810 926 : ext4_update_i_disksize(inode, size);
3811 926 : ret = ext4_mark_inode_dirty(handle, inode);
3812 926 : ext4_journal_stop(handle);
3813 :
3814 926 : return ret;
3815 : }
3816 :
3817 0 : static void ext4_wait_dax_page(struct inode *inode)
3818 : {
3819 0 : filemap_invalidate_unlock(inode->i_mapping);
3820 0 : schedule();
3821 0 : filemap_invalidate_lock(inode->i_mapping);
3822 0 : }
3823 :
3824 1150136 : int ext4_break_layouts(struct inode *inode)
3825 : {
3826 1150136 : struct page *page;
3827 1150136 : int error;
3828 :
3829 1150136 : if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
3830 : return -EINVAL;
3831 :
3832 1150136 : do {
3833 1150136 : page = dax_layout_busy_page(inode->i_mapping);
3834 1150115 : if (!page)
3835 : return 0;
3836 :
3837 0 : error = ___wait_var_event(&page->_refcount,
3838 : atomic_read(&page->_refcount) == 1,
3839 : TASK_INTERRUPTIBLE, 0, 0,
3840 : ext4_wait_dax_page(inode));
3841 0 : } while (error == 0);
3842 :
3843 : return error;
3844 : }
3845 :
3846 : /*
3847 : * ext4_punch_hole: punches a hole in a file by releasing the blocks
3848 : * associated with the given offset and length
3849 : *
3850 : * @inode: File inode
3851 : * @offset: The offset where the hole will begin
3852 : * @len: The length of the hole
3853 : *
3854 : * Returns: 0 on success or negative on failure
3855 : */
3856 :
3857 266117 : int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3858 : {
3859 266117 : struct inode *inode = file_inode(file);
3860 266117 : struct super_block *sb = inode->i_sb;
3861 266117 : ext4_lblk_t first_block, stop_block;
3862 266117 : struct address_space *mapping = inode->i_mapping;
3863 266117 : loff_t first_block_offset, last_block_offset, max_length;
3864 266117 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3865 266117 : handle_t *handle;
3866 266117 : unsigned int credits;
3867 266117 : int ret = 0, ret2 = 0;
3868 :
3869 266117 : trace_ext4_punch_hole(inode, offset, length, 0);
3870 :
3871 : /*
3872 : * Write out all dirty pages to avoid race conditions,
3873 : * then release them.
3874 : */
3875 266117 : if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
3876 136323 : ret = filemap_write_and_wait_range(mapping, offset,
3877 136323 : offset + length - 1);
3878 136317 : if (ret)
3879 : return ret;
3880 : }
3881 :
3882 266108 : inode_lock(inode);
3883 :
3884 : /* No need to punch hole beyond i_size */
3885 266121 : if (offset >= inode->i_size)
3886 11443 : goto out_mutex;
3887 :
3888 : /*
3889 : * If the hole extends beyond i_size, set the hole
3890 : * to end after the page that contains i_size
3891 : */
3892 254678 : if (offset + length > inode->i_size) {
3893 2487 : length = inode->i_size +
3894 2487 : PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
3895 : offset;
3896 : }
3897 :
3898 : /*
3899 : * For punch hole, offset + length must stay at least one block below
3900 : * s_bitmap_maxbytes. Adjust the length if it goes beyond that limit.
3901 : */
3902 254678 : max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
3903 254678 : if (offset + length > max_length)
3904 0 : length = max_length - offset;
3905 :
3906 254678 : if (offset & (sb->s_blocksize - 1) ||
3907 56549 : (offset + length) & (sb->s_blocksize - 1)) {
3908 : /*
3909 : * Attach jinode to inode for jbd2 if we do any zeroing of
3910 : * partial block
3911 : */
3912 198163 : ret = ext4_inode_attach_jinode(inode);
3913 198167 : if (ret < 0)
3914 0 : goto out_mutex;
3915 :
3916 : }
3917 :
3918 : /* Wait for all existing dio workers; newcomers will block on i_rwsem */
3919 254682 : inode_dio_wait(inode);
3920 :
3921 254678 : ret = file_modified(file);
3922 254684 : if (ret)
3923 0 : goto out_mutex;
3924 :
3925 : /*
3926 : * Prevent page faults from reinstantiating pages we have released from
3927 : * page cache.
3928 : */
3929 254684 : filemap_invalidate_lock(mapping);
3930 :
3931 254683 : ret = ext4_break_layouts(inode);
3932 254683 : if (ret)
3933 0 : goto out_dio;
3934 :
3935 254683 : first_block_offset = round_up(offset, sb->s_blocksize);
3936 254683 : last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3937 :
3938 : /* Now release the pages and zero the block-aligned part of them */
3939 254683 : if (last_block_offset > first_block_offset) {
3940 238207 : ret = ext4_update_disksize_before_punch(inode, offset, length);
3941 238207 : if (ret)
3942 0 : goto out_dio;
3943 238207 : truncate_pagecache_range(inode, first_block_offset,
3944 : last_block_offset);
3945 : }
3946 :
3947 254684 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3948 254680 : credits = ext4_writepage_trans_blocks(inode);
3949 : else
3950 4 : credits = ext4_blocks_for_truncate(inode);
3951 254682 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3952 254683 : if (IS_ERR(handle)) {
3953 0 : ret = PTR_ERR(handle);
3954 0 : ext4_std_error(sb, ret);
3955 0 : goto out_dio;
3956 : }
3957 :
3958 254683 : ret = ext4_zero_partial_blocks(handle, inode, offset,
3959 : length);
3960 254684 : if (ret)
3961 0 : goto out_stop;
3962 :
3963 0 : first_block = (offset + sb->s_blocksize - 1) >>
3964 254684 : EXT4_BLOCK_SIZE_BITS(sb);
3965 254684 : stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
3966 :
3967 : /* If there are blocks to remove, do it */
3968 254684 : if (stop_block > first_block) {
3969 :
3970 238209 : down_write(&EXT4_I(inode)->i_data_sem);
3971 238209 : ext4_discard_preallocations(inode, 0);
3972 :
3973 238208 : ext4_es_remove_extent(inode, first_block,
3974 : stop_block - first_block);
3975 :
3976 238208 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 238204 : ret = ext4_ext_remove_space(inode, first_block,
3978 : stop_block - 1);
3979 : else
3980 4 : ret = ext4_ind_remove_space(handle, inode, first_block,
3981 : stop_block);
3982 :
3983 238207 : up_write(&EXT4_I(inode)->i_data_sem);
3984 : }
3985 254682 : ext4_fc_track_range(handle, inode, first_block, stop_block);
3986 254682 : if (IS_SYNC(inode))
3987 0 : ext4_handle_sync(handle);
3988 :
3989 254682 : inode->i_mtime = inode->i_ctime = current_time(inode);
3990 254682 : ret2 = ext4_mark_inode_dirty(handle, inode);
3991 254684 : if (unlikely(ret2))
3992 0 : ret = ret2;
3993 254684 : if (ret >= 0)
3994 254684 : ext4_update_inode_fsync_trans(handle, inode, 1);
3995 0 : out_stop:
3996 254683 : ext4_journal_stop(handle);
3997 254684 : out_dio:
3998 254684 : filemap_invalidate_unlock(mapping);
3999 266127 : out_mutex:
4000 266127 : inode_unlock(inode);
4001 266127 : return ret;
4002 : }
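
/*
 * Illustrative userspace sketch (not part of the original source):
 * ext4_punch_hole() is reached via fallocate(2), e.g.:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	// Deallocate 1 MiB starting at offset 4096; later reads of the
 *	// hole return zeroes. KEEP_SIZE is mandatory with PUNCH_HOLE.
 *	int punch_example(int fd)
 *	{
 *		return fallocate(fd, FALLOC_FL_PUNCH_HOLE |
 *				 FALLOC_FL_KEEP_SIZE, 4096, 1024 * 1024);
 *	}
 */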
4003 :
4004 5334784 : int ext4_inode_attach_jinode(struct inode *inode)
4005 : {
4006 5334784 : struct ext4_inode_info *ei = EXT4_I(inode);
4007 5334784 : struct jbd2_inode *jinode;
4008 :
4009 5334784 : if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
4010 : return 0;
4011 :
4012 1993720 : jinode = jbd2_alloc_inode(GFP_KERNEL);
4013 1998559 : spin_lock(&inode->i_lock);
4014 2001784 : if (!ei->jinode) {
4015 2001093 : if (!jinode) {
4016 0 : spin_unlock(&inode->i_lock);
4017 0 : return -ENOMEM;
4018 : }
4019 2001093 : ei->jinode = jinode;
4020 2001093 : jbd2_journal_init_jbd_inode(ei->jinode, inode);
4021 2001093 : jinode = NULL;
4022 : }
4023 1988595 : spin_unlock(&inode->i_lock);
4024 1999685 : if (unlikely(jinode != NULL))
4025 3 : jbd2_free_inode(jinode);
4026 : return 0;
4027 : }
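
/*
 * Note on the pattern above (illustrative): the jinode is allocated
 * before i_lock is taken because jbd2_alloc_inode(GFP_KERNEL) may
 * sleep. If another task attached a jinode while we were allocating,
 * the spare one is freed after the spinlock is dropped.
 */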
4028 :
4029 : /*
4030 : * ext4_truncate()
4031 : *
4032 : * We block out ext4_get_block() block instantiations across the entire
4033 : * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4034 : * simultaneously on behalf of the same inode.
4035 : *
4036 : * As we work through the truncate and commit bits of it to the journal there
4037 : * is one core, guiding principle: the file's tree must always be consistent on
4038 : * disk. We must be able to restart the truncate after a crash.
4039 : *
4040 : * The file's tree may be transiently inconsistent in memory (although it
4041 : * probably isn't), but whenever we close off and commit a journal transaction,
4042 : * the contents of (the filesystem + the journal) must be consistent and
4043 : * restartable. It's pretty simple, really: bottom up, right to left (although
4044 : * left-to-right works OK too).
4045 : *
4046 : * Note that at recovery time, journal replay occurs *before* the restart of
4047 : * truncate against the orphan inode list.
4048 : *
4049 : * The committed inode has the new, desired i_size (which is the same as
4050 : * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4051 : * that this inode's truncate did not complete and it will again call
4052 : * ext4_truncate() to have another go. So there will be instantiated blocks
4053 : * to the right of the truncation point in a crashed ext4 filesystem. But
4054 : * that's fine - as long as they are linked from the inode, the post-crash
4055 : * ext4_truncate() run will find them and release them.
4056 : */
4057 594740 : int ext4_truncate(struct inode *inode)
4058 : {
4059 594740 : struct ext4_inode_info *ei = EXT4_I(inode);
4060 594740 : unsigned int credits;
4061 594740 : int err = 0, err2;
4062 594740 : handle_t *handle;
4063 594740 : struct address_space *mapping = inode->i_mapping;
4064 :
4065 : /*
4066 : * There is a possibility that we're either freeing the inode
4067 : * or it's a completely new inode. In those cases we might not
4068 : * have i_rwsem locked because it's not necessary.
4069 : */
4070 594740 : if (!(inode->i_state & (I_NEW|I_FREEING)))
4071 312682 : WARN_ON(!inode_is_locked(inode));
4072 594740 : trace_ext4_truncate_enter(inode);
4073 :
4074 594519 : if (!ext4_can_truncate(inode))
4075 0 : goto out_trace;
4076 :
4077 594534 : if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4078 356454 : ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4079 :
4080 594692 : if (ext4_has_inline_data(inode)) {
4081 0 : int has_inline = 1;
4082 :
4083 0 : err = ext4_inline_data_truncate(inode, &has_inline);
4084 0 : if (err || has_inline)
4085 0 : goto out_trace;
4086 : }
4087 :
4088 : /* If we zero out the tail of the page, we have to create a jinode for jbd2 */
4089 594692 : if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
4090 134610 : err = ext4_inode_attach_jinode(inode);
4091 134613 : if (err)
4092 0 : goto out_trace;
4093 : }
4094 :
4095 594695 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4096 594669 : credits = ext4_writepage_trans_blocks(inode);
4097 : else
4098 26 : credits = ext4_blocks_for_truncate(inode);
4099 :
4100 594439 : handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
4101 594903 : if (IS_ERR(handle)) {
4102 0 : err = PTR_ERR(handle);
4103 0 : goto out_trace;
4104 : }
4105 :
4106 594903 : if (inode->i_size & (inode->i_sb->s_blocksize - 1))
4107 134637 : ext4_block_truncate_page(handle, mapping, inode->i_size);
4108 :
4109 : /*
4110 : * We add the inode to the orphan list, so that if this
4111 : * truncate spans multiple transactions, and we crash, we will
4112 : * resume the truncate when the filesystem recovers. It also
4113 : * marks the inode dirty, to catch the new size.
4114 : *
4115 : * Implication: the file must always be in a sane, consistent
4116 : * truncatable state while each transaction commits.
4117 : */
4118 594908 : err = ext4_orphan_add(handle, inode);
4119 595062 : if (err)
4120 0 : goto out_stop;
4121 :
4122 595062 : down_write(&EXT4_I(inode)->i_data_sem);
4123 :
4124 595036 : ext4_discard_preallocations(inode, 0);
4125 :
4126 594900 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4127 594874 : err = ext4_ext_truncate(handle, inode);
4128 : else
4129 26 : ext4_ind_truncate(handle, inode);
4130 :
4131 594898 : up_write(&ei->i_data_sem);
4132 594799 : if (err)
4133 1 : goto out_stop;
4134 :
4135 594798 : if (IS_SYNC(inode))
4136 15 : ext4_handle_sync(handle);
4137 :
4138 594783 : out_stop:
4139 : /*
4140 : * If this was a simple ftruncate() and the file will remain alive,
4141 : * then we need to clear up the orphan record which we created above.
4142 : * However, if this was a real unlink then we were called by
4143 : * ext4_evict_inode(), and we allow that function to clean up the
4144 : * orphan info for us.
4145 : */
4146 594799 : if (inode->i_nlink)
4147 312774 : ext4_orphan_del(handle, inode);
4148 :
4149 595122 : inode->i_mtime = inode->i_ctime = current_time(inode);
4150 595117 : err2 = ext4_mark_inode_dirty(handle, inode);
4151 595144 : if (unlikely(err2 && !err))
4152 0 : err = err2;
4153 595144 : ext4_journal_stop(handle);
4154 :
4155 595135 : out_trace:
4156 595135 : trace_ext4_truncate_exit(inode);
4157 595129 : return err;
4158 : }
4159 :
4160 : static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
4161 : {
4162 73189929 : if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4163 204 : return inode_peek_iversion_raw(inode);
4164 : else
4165 73189725 : return inode_peek_iversion(inode);
4166 : }
4167 :
4168 73163728 : static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
4169 : struct ext4_inode_info *ei)
4170 : {
4171 73163728 : struct inode *inode = &(ei->vfs_inode);
4172 73163728 : u64 i_blocks = READ_ONCE(inode->i_blocks);
4173 73163728 : struct super_block *sb = inode->i_sb;
4174 :
4175 73163728 : if (i_blocks <= ~0U) {
4176 : /*
4177 : * i_blocks can be represented in a 32 bit variable
4178 : * as a multiple of 512 bytes
4179 : */
4180 73163728 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4181 73163728 : raw_inode->i_blocks_high = 0;
4182 73163728 : ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4183 73163728 : return 0;
4184 : }
4185 :
4186 : /*
4187 : * This should never happen since sb->s_maxbytes should not have
4188 : * allowed this; sb->s_maxbytes was set according to the huge_file
4189 : * feature in ext4_fill_super().
4190 : */
4191 0 : if (!ext4_has_feature_huge_file(sb))
4192 : return -EFSCORRUPTED;
4193 :
4194 0 : if (i_blocks <= 0xffffffffffffULL) {
4195 : /*
4196 : * i_blocks can be represented in a 48 bit variable
4197 : * as a multiple of 512 bytes
4198 : */
4199 0 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4200 0 : raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4201 0 : ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4202 : } else {
4203 0 : ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4204 : /* i_blocks is stored in filesystem-block-size units */
4205 0 : i_blocks = i_blocks >> (inode->i_blkbits - 9);
4206 0 : raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4207 0 : raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4208 : }
4209 : return 0;
4210 : }
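
/*
 * Worked example for the encoding above (illustrative): with
 * i_blocks = 0x123456789 (512-byte units) the value no longer fits in
 * 32 bits, so with huge_file it is stored as i_blocks_lo = 0x23456789
 * and i_blocks_high = 0x0001, leaving EXT4_INODE_HUGE_FILE clear
 * because the count is still in units of 512 bytes.
 */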
4211 :
4212 73176014 : static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
4213 : {
4214 73176014 : struct ext4_inode_info *ei = EXT4_I(inode);
4215 73176014 : uid_t i_uid;
4216 73176014 : gid_t i_gid;
4217 73176014 : projid_t i_projid;
4218 73176014 : int block;
4219 73176014 : int err;
4220 :
4221 73176014 : err = ext4_inode_blocks_set(raw_inode, ei);
4222 :
4223 73333064 : raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4224 73333064 : i_uid = i_uid_read(inode);
4225 73343343 : i_gid = i_gid_read(inode);
4226 73350444 : i_projid = from_kprojid(&init_user_ns, ei->i_projid);
4227 73271646 : if (!(test_opt(inode->i_sb, NO_UID32))) {
4228 73271646 : raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
4229 73271646 : raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
4230 : /*
4231 : * Fix up interoperability with old kernels. Otherwise,
4232 : * old inodes get re-used with the upper 16 bits of the
4233 : * uid/gid intact.
4234 : */
4235 73271646 : if (ei->i_dtime && list_empty(&ei->i_orphan)) {
4236 1966495 : raw_inode->i_uid_high = 0;
4237 1966495 : raw_inode->i_gid_high = 0;
4238 : } else {
4239 71305151 : raw_inode->i_uid_high =
4240 71305151 : cpu_to_le16(high_16_bits(i_uid));
4241 71305151 : raw_inode->i_gid_high =
4242 71305151 : cpu_to_le16(high_16_bits(i_gid));
4243 : }
4244 : } else {
4245 0 : raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
4246 0 : raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
4247 0 : raw_inode->i_uid_high = 0;
4248 0 : raw_inode->i_gid_high = 0;
4249 : }
4250 73271646 : raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
4251 :
4252 73271646 : EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4253 73271646 : EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4254 73271646 : EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4255 73271646 : EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4256 :
4257 73271646 : raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4258 73271646 : raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4259 73271646 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
4260 73271646 : raw_inode->i_file_acl_high =
4261 73271646 : cpu_to_le16(ei->i_file_acl >> 32);
4262 73271646 : raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4263 73271646 : ext4_isize_set(raw_inode, ei->i_disksize);
4264 :
4265 73271646 : raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4266 73271646 : if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4267 1119042 : if (old_valid_dev(inode->i_rdev)) {
4268 559521 : raw_inode->i_block[0] =
4269 559521 : cpu_to_le32(old_encode_dev(inode->i_rdev));
4270 559521 : raw_inode->i_block[1] = 0;
4271 : } else {
4272 0 : raw_inode->i_block[0] = 0;
4273 0 : raw_inode->i_block[1] =
4274 0 : cpu_to_le32(new_encode_dev(inode->i_rdev));
4275 0 : raw_inode->i_block[2] = 0;
4276 : }
4277 72712125 : } else if (!ext4_has_inline_data(inode)) {
4278 1159007747 : for (block = 0; block < EXT4_N_BLOCKS; block++)
4279 1086377339 : raw_inode->i_block[block] = ei->i_data[block];
4280 : }
4281 :
4282 73189929 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4283 73189929 : u64 ivers = ext4_inode_peek_iversion(inode);
4284 :
4285 73189929 : raw_inode->i_disk_version = cpu_to_le32(ivers);
4286 73189929 : if (ei->i_extra_isize) {
4287 73259627 : if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4288 73254210 : raw_inode->i_version_hi =
4289 73254210 : cpu_to_le32(ivers >> 32);
4290 73259627 : raw_inode->i_extra_isize =
4291 73259627 : cpu_to_le16(ei->i_extra_isize);
4292 : }
4293 : }
4294 :
4295 73189929 : if (i_projid != EXT4_DEF_PROJID &&
4296 53262 : !ext4_has_feature_project(inode->i_sb))
4297 0 : err = err ?: -EFSCORRUPTED;
4298 :
4299 73189929 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4300 73227147 : EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4301 73227423 : raw_inode->i_projid = cpu_to_le32(i_projid);
4302 :
4303 73189929 : ext4_inode_csum_set(inode, raw_inode, ei);
4304 73308143 : return err;
4305 : }
4306 :
4307 : /*
4308 : * ext4_get_inode_loc returns with an extra refcount against the inode's
4309 : * underlying buffer_head on success. If we pass 'inode' and it does not
4310 : * have in-inode xattr, we have all inode data in memory that is needed
4311 : * to recreate the on-disk version of this inode.
4312 : */
4313 74702105 : static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
4314 : struct inode *inode, struct ext4_iloc *iloc,
4315 : ext4_fsblk_t *ret_block)
4316 : {
4317 74702105 : struct ext4_group_desc *gdp;
4318 74702105 : struct buffer_head *bh;
4319 74702105 : ext4_fsblk_t block;
4320 74702105 : struct blk_plug plug;
4321 74702105 : int inodes_per_block, inode_offset;
4322 :
4323 74702105 : iloc->bh = NULL;
4324 74702105 : if (ino < EXT4_ROOT_INO ||
4325 74702105 : ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
4326 : return -EFSCORRUPTED;
4327 :
4328 74702105 : iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
4329 74702105 : gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4330 74468106 : if (!gdp)
4331 : return -EIO;
4332 :
4333 : /*
4334 : * Figure out the offset within the block group inode table
4335 : */
4336 74468106 : inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4337 0 : inode_offset = ((ino - 1) %
4338 74468106 : EXT4_INODES_PER_GROUP(sb));
4339 74468106 : iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4340 :
4341 74468106 : block = ext4_inode_table(sb, gdp);
4342 149166394 : if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
4343 : (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
4344 0 : ext4_error(sb, "Invalid inode table block %llu in "
4345 : "block_group %u", block, iloc->block_group);
4346 0 : return -EFSCORRUPTED;
4347 : }
4348 74583197 : block += (inode_offset / inodes_per_block);
4349 :
4350 74583197 : bh = sb_getblk(sb, block);
4351 74711580 : if (unlikely(!bh))
4352 : return -ENOMEM;
4353 74711580 : if (ext4_buffer_uptodate(bh))
4354 74316468 : goto has_buffer;
4355 :
4356 227382 : lock_buffer(bh);
4357 227090 : if (ext4_buffer_uptodate(bh)) {
4358 : /* Someone brought it uptodate while we waited */
4359 46693 : unlock_buffer(bh);
4360 47267 : goto has_buffer;
4361 : }
4362 :
4363 : /*
4364 : * If we have all of the inode's information in memory and this
4365 : * is the only valid inode in the block, we need not read the
4366 : * block.
4367 : */
4368 180217 : if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4369 177333 : struct buffer_head *bitmap_bh;
4370 177333 : int i, start;
4371 :
4372 177333 : start = inode_offset & ~(inodes_per_block - 1);
4373 :
4374 : /* Is the inode bitmap in cache? */
4375 177333 : bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4376 177603 : if (unlikely(!bitmap_bh))
4377 0 : goto make_io;
4378 :
4379 : /*
4380 : * If the inode bitmap isn't in cache then the
4381 : * optimisation may end up performing two reads instead
4382 : * of one, so skip it.
4383 : */
4384 354945 : if (!buffer_uptodate(bitmap_bh)) {
4385 41 : brelse(bitmap_bh);
4386 41 : goto make_io;
4387 : }
4388 1759548 : for (i = start; i < start + inodes_per_block; i++) {
4389 1585515 : if (i == inode_offset)
4390 175916 : continue;
4391 1409599 : if (ext4_test_bit(i, bitmap_bh->b_data))
4392 : break;
4393 : }
4394 177597 : brelse(bitmap_bh);
4395 177476 : if (i == start + inodes_per_block) {
4396 173912 : struct ext4_inode *raw_inode =
4397 173912 : (struct ext4_inode *) (bh->b_data + iloc->offset);
4398 :
4399 : /* all other inodes are free, so skip I/O */
4400 173912 : memset(bh->b_data, 0, bh->b_size);
4401 173912 : if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
4402 0 : ext4_fill_raw_inode(inode, raw_inode);
4403 173912 : set_buffer_uptodate(bh);
4404 174111 : unlock_buffer(bh);
4405 174048 : goto has_buffer;
4406 : }
4407 : }
4408 :
4409 6448 : make_io:
4410 : /*
4411 : * If we need to do any I/O, try to pre-readahead extra
4412 : * blocks from the inode table.
4413 : */
4414 6489 : blk_start_plug(&plug);
4415 6489 : if (EXT4_SB(sb)->s_inode_readahead_blks) {
4416 6489 : ext4_fsblk_t b, end, table;
4417 6489 : unsigned num;
4418 6489 : __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
4419 :
4420 6489 : table = ext4_inode_table(sb, gdp);
4421 : /* s_inode_readahead_blks is always a power of 2 */
4422 6489 : b = block & ~((ext4_fsblk_t) ra_blks - 1);
4423 6489 : if (table > b)
4424 : b = table;
4425 6489 : end = b + ra_blks;
4426 6489 : num = EXT4_INODES_PER_GROUP(sb);
4427 6489 : if (ext4_has_group_desc_csum(sb))
4428 6299 : num -= ext4_itable_unused_count(sb, gdp);
4429 6490 : table += num / inodes_per_block;
4430 6490 : if (end > table)
4431 : end = table;
4432 91288 : while (b <= end)
4433 84798 : ext4_sb_breadahead_unmovable(sb, b++);
4434 : }
4435 :
4436 : /*
4437 : * There are other valid inodes in the buffer, this inode
4438 : * has in-inode xattrs, or we don't have this inode in memory.
4439 : * Read the block from disk.
4440 : */
4441 6490 : trace_ext4_load_inode(sb, ino);
4442 6490 : ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
4443 6488 : blk_finish_plug(&plug);
4444 6490 : wait_on_buffer(bh);
4445 6486 : ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
4446 12976 : if (!buffer_uptodate(bh)) {
4447 1 : if (ret_block)
4448 1 : *ret_block = block;
4449 1 : brelse(bh);
4450 1 : return -EIO;
4451 : }
4452 6487 : has_buffer:
4453 74544270 : iloc->bh = bh;
4454 74544270 : return 0;
4455 : }
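
/*
 * Worked example for the location math above (illustrative): with
 * 4096-byte blocks, 256-byte inodes (16 per block) and 8192 inodes per
 * group, ino = 10000 yields block_group = 9999 / 8192 = 1,
 * inode_offset = 9999 % 8192 = 1807, iloc->offset = (1807 % 16) * 256
 * = 3840, and the block read is the group's inode table start plus
 * 1807 / 16 = 112.
 */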
4456 :
4457 315806 : static int __ext4_get_inode_loc_noinmem(struct inode *inode,
4458 : struct ext4_iloc *iloc)
4459 : {
4460 315806 : ext4_fsblk_t err_blk = 0;
4461 315806 : int ret;
4462 :
4463 315806 : ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
4464 : &err_blk);
4465 :
4466 315806 : if (ret == -EIO)
4467 0 : ext4_error_inode_block(inode, err_blk, EIO,
4468 : "unable to read itable block");
4469 :
4470 315806 : return ret;
4471 : }
4472 :
4473 74302568 : int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4474 : {
4475 74302568 : ext4_fsblk_t err_blk = 0;
4476 74302568 : int ret;
4477 :
4478 74302568 : ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
4479 : &err_blk);
4480 :
4481 74228634 : if (ret == -EIO)
4482 1 : ext4_error_inode_block(inode, err_blk, EIO,
4483 : "unable to read itable block");
4484 :
4485 74228634 : return ret;
4486 : }
4487 :
4489 0 : int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
4490 : struct ext4_iloc *iloc)
4491 : {
4492 0 : return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
4493 : }
4494 :
4495 2972349 : static bool ext4_should_enable_dax(struct inode *inode)
4496 : {
4497 2972349 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4498 :
4499 2972349 : if (test_opt2(inode->i_sb, DAX_NEVER))
4500 : return false;
4501 2972349 : if (!S_ISREG(inode->i_mode))
4502 : return false;
4503 2247773 : if (ext4_should_journal_data(inode))
4504 : return false;
4505 2243863 : if (ext4_has_inline_data(inode))
4506 : return false;
4507 2243863 : if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
4508 : return false;
4509 2243863 : if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
4510 : return false;
4511 2243863 : if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
4512 : return false;
4513 0 : if (test_opt(inode->i_sb, DAX_ALWAYS))
4514 : return true;
4515 :
4516 0 : return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
4517 : }
4518 :
4519 2991711 : void ext4_set_inode_flags(struct inode *inode, bool init)
4520 : {
4521 2991711 : unsigned int flags = EXT4_I(inode)->i_flags;
4522 2991711 : unsigned int new_fl = 0;
4523 :
4524 5983422 : WARN_ON_ONCE(IS_DAX(inode) && init);
4525 :
4526 2991711 : if (flags & EXT4_SYNC_FL)
4527 23 : new_fl |= S_SYNC;
4528 2991711 : if (flags & EXT4_APPEND_FL)
4529 28 : new_fl |= S_APPEND;
4530 2991711 : if (flags & EXT4_IMMUTABLE_FL)
4531 154 : new_fl |= S_IMMUTABLE;
4532 2991711 : if (flags & EXT4_NOATIME_FL)
4533 7 : new_fl |= S_NOATIME;
4534 2991711 : if (flags & EXT4_DIRSYNC_FL)
4535 0 : new_fl |= S_DIRSYNC;
4536 :
4537 : /* Because of the way inode_set_flags() works we must preserve S_DAX
4538 : * here if already set. */
4539 2991711 : new_fl |= (inode->i_flags & S_DAX);
4540 2991711 : if (init && ext4_should_enable_dax(inode))
4541 0 : new_fl |= S_DAX;
4542 :
4543 2978656 : if (flags & EXT4_ENCRYPT_FL)
4544 0 : new_fl |= S_ENCRYPTED;
4545 2978656 : if (flags & EXT4_CASEFOLD_FL)
4546 0 : new_fl |= S_CASEFOLD;
4547 2978656 : if (flags & EXT4_VERITY_FL)
4548 0 : new_fl |= S_VERITY;
4549 2978656 : inode_set_flags(inode, new_fl,
4550 : S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
4551 : S_ENCRYPTED|S_CASEFOLD|S_VERITY);
4552 3002624 : }
4553 :
4554 214743 : static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4555 : struct ext4_inode_info *ei)
4556 : {
4557 214743 : blkcnt_t i_blocks;
4558 214743 : struct inode *inode = &(ei->vfs_inode);
4559 214743 : struct super_block *sb = inode->i_sb;
4560 :
4561 214743 : if (ext4_has_feature_huge_file(sb)) {
4562 : /* we are using combined 48 bit field */
4563 214273 : i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4564 214273 : le32_to_cpu(raw_inode->i_blocks_lo);
4565 214273 : if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4566 : /* i_blocks represent file system block size */
4567 0 : return i_blocks << (inode->i_blkbits - 9);
4568 : } else {
4569 : return i_blocks;
4570 : }
4571 : } else {
4572 470 : return le32_to_cpu(raw_inode->i_blocks_lo);
4573 : }
4574 : }
4575 :
4576 214407 : static inline int ext4_iget_extra_inode(struct inode *inode,
4577 : struct ext4_inode *raw_inode,
4578 : struct ext4_inode_info *ei)
4579 : {
4580 214407 : __le32 *magic = (void *)raw_inode +
4581 214407 : EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
4582 :
4583 214407 : if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
4584 214407 : *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
4585 2979 : int err;
4586 :
4587 2979 : ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4588 2979 : err = ext4_find_inline_data_nolock(inode);
4589 2979 : if (!err && ext4_has_inline_data(inode))
4590 0 : ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
4591 2979 : return err;
4592 : } else
4593 211428 : EXT4_I(inode)->i_inline_off = 0;
4594 211428 : return 0;
4595 : }
4596 :
4597 154 : int ext4_get_projid(struct inode *inode, kprojid_t *projid)
4598 : {
4599 154 : if (!ext4_has_feature_project(inode->i_sb))
4600 : return -EOPNOTSUPP;
4601 154 : *projid = EXT4_I(inode)->i_projid;
4602 154 : return 0;
4603 : }
4604 :
4605 : /*
4606 : * ext4 self-manages i_version for ea inodes: it stores the lower 32 bits of
4607 : * the refcount in i_version, so use raw values if the inode has the
4608 : * EXT4_EA_INODE_FL flag set.
4609 : */
4610 : static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
4611 : {
4612 214741 : if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4613 0 : inode_set_iversion_raw(inode, val);
4614 : else
4615 214741 : inode_set_iversion_queried(inode, val);
4616 : }
4617 :
4618 254583 : static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
4619 : {
4621 254583 : if (flags & EXT4_IGET_EA_INODE) {
4622 85 : if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4623 : return "missing EA_INODE flag";
4624 85 : if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
4625 85 : EXT4_I(inode)->i_file_acl)
4626 : return "ea_inode with extended attributes";
4627 : } else {
4628 254498 : if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
4629 : return "unexpected EA_INODE flag";
4630 : }
4631 254583 : if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
4632 0 : return "unexpected bad inode w/o EXT4_IGET_BAD";
4633 : return NULL;
4634 : }
4635 :
4636 255622 : struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
4637 : ext4_iget_flags flags, const char *function,
4638 : unsigned int line)
4639 : {
4640 255622 : struct ext4_iloc iloc;
4641 255622 : struct ext4_inode *raw_inode;
4642 255622 : struct ext4_inode_info *ei;
4643 255622 : struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4644 255622 : struct inode *inode;
4645 255622 : const char *err_str;
4646 255622 : journal_t *journal = EXT4_SB(sb)->s_journal;
4647 255622 : long ret;
4648 255622 : loff_t size;
4649 255622 : int block;
4650 255622 : uid_t i_uid;
4651 255622 : gid_t i_gid;
4652 255622 : projid_t i_projid;
4653 :
4654 255622 : if ((!(flags & EXT4_IGET_SPECIAL) &&
4655 247949 : ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
4656 247949 : ino == le32_to_cpu(es->s_usr_quota_inum) ||
4657 247949 : ino == le32_to_cpu(es->s_grp_quota_inum) ||
4658 247949 : ino == le32_to_cpu(es->s_prj_quota_inum) ||
4659 255622 : ino == le32_to_cpu(es->s_orphan_file_inum))) ||
4660 255622 : (ino < EXT4_ROOT_INO) ||
4661 255622 : (ino > le32_to_cpu(es->s_inodes_count))) {
4662 10 : if (flags & EXT4_IGET_HANDLE)
4663 : return ERR_PTR(-ESTALE);
4664 0 : __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
4665 : "inode #%lu: comm %s: iget: illegal inode #",
4666 0 : ino, current->comm);
4667 0 : return ERR_PTR(-EFSCORRUPTED);
4668 : }
4669 :
4670 255612 : inode = iget_locked(sb, ino);
4671 255628 : if (!inode)
4672 : return ERR_PTR(-ENOMEM);
4673 255628 : if (!(inode->i_state & I_NEW)) {
4674 39846 : if ((err_str = check_igot_inode(inode, flags)) != NULL) {
4675 0 : ext4_error_inode(inode, function, line, 0, err_str);
4676 0 : iput(inode);
4677 0 : return ERR_PTR(-EFSCORRUPTED);
4678 : }
4679 : return inode;
4680 : }
4681 :
4682 215782 : ei = EXT4_I(inode);
4683 215782 : iloc.bh = NULL;
4684 :
4685 215782 : ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
4686 215782 : if (ret < 0)
4687 0 : goto bad_inode;
4688 215782 : raw_inode = ext4_raw_inode(&iloc);
4689 :
4690 215782 : if ((flags & EXT4_IGET_HANDLE) &&
4691 4213 : (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
4692 0 : ret = -ESTALE;
4693 0 : goto bad_inode;
4694 : }
4695 :
4696 215782 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4697 215446 : ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4698 215446 : if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4699 215446 : EXT4_INODE_SIZE(inode->i_sb) ||
4700 : (ei->i_extra_isize & 3)) {
4701 0 : ext4_error_inode(inode, function, line, 0,
4702 : "iget: bad extra_isize %u "
4703 : "(inode size %u)",
4704 : ei->i_extra_isize,
4705 : EXT4_INODE_SIZE(inode->i_sb));
4706 0 : ret = -EFSCORRUPTED;
4707 0 : goto bad_inode;
4708 : }
4709 : } else
4710 336 : ei->i_extra_isize = 0;
4711 :
4712 : /* Precompute checksum seed for inode metadata */
4713 215782 : if (ext4_has_metadata_csum(sb)) {
4714 215312 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4715 215312 : __u32 csum;
4716 215312 : __le32 inum = cpu_to_le32(inode->i_ino);
4717 215312 : __le32 gen = raw_inode->i_generation;
4718 215312 : csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
4719 : sizeof(inum));
4720 215310 : ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
4721 : sizeof(gen));
4722 : }
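 : 	/*
 : 	 * In other words, i_csum_seed ends up as
 : 	 * crc32c(crc32c(s_csum_seed, inode number), generation), since
 : 	 * ext4_chksum() is crc32c when metadata_csum is enabled.
 : 	 */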
4723 :
4724 215780 : if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
4725 2 : ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
4726 2 : (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
4727 2 : ext4_error_inode_err(inode, function, line, 0,
4728 : EFSBADCRC, "iget: checksum invalid");
4729 2 : ret = -EFSBADCRC;
4730 2 : goto bad_inode;
4731 : }
4732 :
4733 215776 : inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4734 215776 : i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4735 215776 : i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4736 215776 : if (ext4_has_feature_project(sb) &&
4737 253 : EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4738 253 : EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4739 253 : i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
4740 : else
4741 : i_projid = EXT4_DEF_PROJID;
4742 :
4743 215776 : if (!(test_opt(inode->i_sb, NO_UID32))) {
4744 215772 : i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4745 215772 : i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4746 : }
4747 215776 : i_uid_write(inode, i_uid);
4748 215776 : i_gid_write(inode, i_gid);
4749 215776 : ei->i_projid = make_kprojid(&init_user_ns, i_projid);
4750 215777 : set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
4751 :
4752 215776 : ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4753 215776 : ei->i_inline_off = 0;
4754 215776 : ei->i_dir_start_lookup = 0;
4755 215776 : ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4756 : /* We now have enough fields to check if the inode was active or not.
4757 : 	 * This is needed because nfsd might try to access dead inodes;
4758 : 	 * the test is the same one that e2fsck uses.
4759 : * NeilBrown 1999oct15
4760 : */
4761 215776 : if (inode->i_nlink == 0) {
4762 51631 : if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
4763 51631 : !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
4764 : ino != EXT4_BOOT_LOADER_INO) {
4765 : /* this inode is deleted or unallocated */
4766 1035 : if (flags & EXT4_IGET_SPECIAL) {
4767 0 : ext4_error_inode(inode, function, line, 0,
4768 : "iget: special inode unallocated");
4769 0 : ret = -EFSCORRUPTED;
4770 : } else
4771 : ret = -ESTALE;
4772 1035 : goto bad_inode;
4773 : }
4774 : /* The only unlinked inodes we let through here have
4775 : * valid i_mode and are being read by the orphan
4776 : * recovery code: that's fine, we're about to complete
4777 : * the process of deleting those.
4778 : * OR it is the EXT4_BOOT_LOADER_INO which is
4779 : * not initialized on a new filesystem. */
4780 : }
4781 214741 : ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4782 214741 : ext4_set_inode_flags(inode, true);
4783 214743 : inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4784 214740 : ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4785 214740 : if (ext4_has_feature_64bit(sb))
4786 214270 : ei->i_file_acl |=
4787 214270 : ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4788 214740 : inode->i_size = ext4_isize(sb, raw_inode);
4789 214740 : if ((size = i_size_read(inode)) < 0) {
4790 0 : ext4_error_inode(inode, function, line, 0,
4791 : "iget: bad i_size value: %lld", size);
4792 0 : ret = -EFSCORRUPTED;
4793 0 : goto bad_inode;
4794 : }
4795 : /*
4796 : 	 * If dir_index is not enabled but there's a dir with the INDEX flag set,
4797 : 	 * we'd normally treat htree data as empty space. But with metadata
4798 : 	 * checksumming that would corrupt the checksums, so forbid it.
4799 : */
4800 214740 : if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
4801 : ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
4802 0 : ext4_error_inode(inode, function, line, 0,
4803 : "iget: Dir with htree data on filesystem without dir_index feature.");
4804 0 : ret = -EFSCORRUPTED;
4805 0 : goto bad_inode;
4806 : }
4807 214740 : ei->i_disksize = inode->i_size;
4808 : #ifdef CONFIG_QUOTA
4809 214740 : ei->i_reserved_quota = 0;
4810 : #endif
4811 214740 : inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4812 214740 : ei->i_block_group = iloc.block_group;
4813 214740 : ei->i_last_alloc_group = ~0;
4814 : /*
4815 : * NOTE! The in-memory inode i_data array is in little-endian order
4816 : * even on big-endian machines: we do NOT byteswap the block numbers!
4817 : */
4818 3435804 : for (block = 0; block < EXT4_N_BLOCKS; block++)
4819 3221062 : ei->i_data[block] = raw_inode->i_block[block];
4820 214742 : INIT_LIST_HEAD(&ei->i_orphan);
4821 214742 : ext4_fc_init_inode(&ei->vfs_inode);
4822 :
4823 : /*
4824 : 	 * Set transaction IDs of transactions that have to be committed
4825 : 	 * to finish f[data]sync. We set them to the currently running transaction
4826 : * as we cannot be sure that the inode or some of its metadata isn't
4827 : * part of the transaction - the inode could have been reclaimed and
4828 : * now it is reread from disk.
4829 : */
4830 214744 : if (journal) {
4831 212222 : transaction_t *transaction;
4832 212222 : tid_t tid;
4833 :
4834 212222 : read_lock(&journal->j_state_lock);
4835 212222 : if (journal->j_running_transaction)
4836 : transaction = journal->j_running_transaction;
4837 : else
4838 41147 : transaction = journal->j_committing_transaction;
4839 212222 : if (transaction)
4840 171076 : tid = transaction->t_tid;
4841 : else
4842 41146 : tid = journal->j_commit_sequence;
4843 212222 : read_unlock(&journal->j_state_lock);
4844 212221 : ei->i_sync_tid = tid;
4845 212221 : ei->i_datasync_tid = tid;
4846 : }
4847 :
4848 214743 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4849 214408 : if (ei->i_extra_isize == 0) {
4850 : /* The extra space is currently unused. Use it. */
4851 0 : BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
4852 0 : ei->i_extra_isize = sizeof(struct ext4_inode) -
4853 : EXT4_GOOD_OLD_INODE_SIZE;
4854 : } else {
4855 214408 : ret = ext4_iget_extra_inode(inode, raw_inode, ei);
4856 214406 : if (ret)
4857 0 : goto bad_inode;
4858 : }
4859 : }
4860 :
4861 214741 : EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4862 214741 : EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4863 214741 : EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4864 214741 : EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4865 :
4866 214741 : if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4867 214741 : u64 ivers = le32_to_cpu(raw_inode->i_disk_version);
4868 :
4869 214741 : if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4870 214407 : if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4871 214407 : ivers |=
4872 214407 : (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4873 : }
4874 214741 : ext4_inode_set_iversion_queried(inode, ivers);
4875 : }
4876 :
4877 214741 : ret = 0;
4878 216426 : if (ei->i_file_acl &&
4879 1685 : !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
4880 0 : ext4_error_inode(inode, function, line, 0,
4881 : "iget: bad extended attribute block %llu",
4882 : ei->i_file_acl);
4883 0 : ret = -EFSCORRUPTED;
4884 0 : goto bad_inode;
4885 214741 : } else if (!ext4_has_inline_data(inode)) {
4886 : /* validate the block references in the inode */
4887 214741 : if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
4888 214742 : (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4889 4496 : (S_ISLNK(inode->i_mode) &&
4890 4496 : !ext4_inode_is_fast_symlink(inode)))) {
4891 202611 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4892 202105 : ret = ext4_ext_check_inode(inode);
4893 : else
4894 506 : ret = ext4_ind_check_inode(inode);
4895 : }
4896 : }
4897 202614 : if (ret)
4898 0 : goto bad_inode;
4899 :
4900 214744 : if (S_ISREG(inode->i_mode)) {
4901 163285 : inode->i_op = &ext4_file_inode_operations;
4902 163285 : inode->i_fop = &ext4_file_operations;
4903 163285 : ext4_set_aops(inode);
4904 51459 : } else if (S_ISDIR(inode->i_mode)) {
4905 34923 : inode->i_op = &ext4_dir_inode_operations;
4906 34923 : inode->i_fop = &ext4_dir_operations;
4907 16536 : } else if (S_ISLNK(inode->i_mode)) {
4908 : 		/* VFS does not allow setting these, so it must be corruption */
4909 4496 : if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
4910 0 : ext4_error_inode(inode, function, line, 0,
4911 : "iget: immutable or append flags "
4912 : "not allowed on symlinks");
4913 0 : ret = -EFSCORRUPTED;
4914 0 : goto bad_inode;
4915 : }
4916 4496 : if (IS_ENCRYPTED(inode)) {
4917 0 : inode->i_op = &ext4_encrypted_symlink_inode_operations;
4918 4496 : } else if (ext4_inode_is_fast_symlink(inode)) {
4919 91 : inode->i_link = (char *)ei->i_data;
4920 91 : inode->i_op = &ext4_fast_symlink_inode_operations;
4921 91 : nd_terminate_link(ei->i_data, inode->i_size,
4922 : sizeof(ei->i_data) - 1);
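 : 			/*
 : 			 * Note: the fast symlink target lives directly in the
 : 			 * 60-byte i_data array (EXT4_N_BLOCKS * 4 bytes), so
 : 			 * targets of up to 59 characters plus a NUL need no
 : 			 * separate data block.
 : 			 */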
4923 : } else {
4924 4405 : inode->i_op = &ext4_symlink_inode_operations;
4925 : }
4926 12040 : } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4927 0 : S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4928 12040 : inode->i_op = &ext4_special_inode_operations;
4929 12040 : if (raw_inode->i_block[0])
4930 0 : init_special_inode(inode, inode->i_mode,
4931 : old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
4932 : else
4933 12040 : init_special_inode(inode, inode->i_mode,
4934 12040 : new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4935 0 : } else if (ino == EXT4_BOOT_LOADER_INO) {
4936 0 : make_bad_inode(inode);
4937 : } else {
4938 0 : ret = -EFSCORRUPTED;
4939 0 : ext4_error_inode(inode, function, line, 0,
4940 : "iget: bogus i_mode (%o)", inode->i_mode);
4941 0 : goto bad_inode;
4942 : }
4943 214740 : if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
4944 0 : ext4_error_inode(inode, function, line, 0,
4945 : "casefold flag without casefold feature");
4946 214740 : if ((err_str = check_igot_inode(inode, flags)) != NULL) {
4947 0 : ext4_error_inode(inode, function, line, 0, err_str);
4948 0 : ret = -EFSCORRUPTED;
4949 0 : goto bad_inode;
4950 : }
4951 :
4952 214741 : brelse(iloc.bh);
4953 214744 : unlock_new_inode(inode);
4954 214744 : return inode;
4955 :
4956 1037 : bad_inode:
4957 1037 : brelse(iloc.bh);
4958 1037 : iget_failed(inode);
4959 1037 : return ERR_PTR(ret);
4960 : }
4961 :
4962 375 : static void __ext4_update_other_inode_time(struct super_block *sb,
4963 : unsigned long orig_ino,
4964 : unsigned long ino,
4965 : struct ext4_inode *raw_inode)
4966 : {
4967 375 : struct inode *inode;
4968 :
4969 375 : inode = find_inode_by_ino_rcu(sb, ino);
4970 375 : if (!inode)
4971 : return;
4972 :
4973 50 : if (!inode_is_dirtytime_only(inode))
4974 : return;
4975 :
4976 0 : spin_lock(&inode->i_lock);
4977 0 : if (inode_is_dirtytime_only(inode)) {
4978 0 : struct ext4_inode_info *ei = EXT4_I(inode);
4979 :
4980 0 : inode->i_state &= ~I_DIRTY_TIME;
4981 0 : spin_unlock(&inode->i_lock);
4982 :
4983 0 : spin_lock(&ei->i_raw_lock);
4984 0 : EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4985 0 : EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4986 0 : EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4987 0 : ext4_inode_csum_set(inode, raw_inode, ei);
4988 0 : spin_unlock(&ei->i_raw_lock);
4989 0 : trace_ext4_other_inode_update_time(inode, orig_ino);
4990 0 : return;
4991 : }
4992 0 : spin_unlock(&inode->i_lock);
4993 : }
4994 :
4995 : /*
4996 : * Opportunistically update the other time fields for other inodes in
4997 : * the same inode table block.
4998 : */
4999 25 : static void ext4_update_other_inodes_time(struct super_block *sb,
5000 : unsigned long orig_ino, char *buf)
5001 : {
5002 25 : unsigned long ino;
5003 25 : int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
5004 25 : int inode_size = EXT4_INODE_SIZE(sb);
5005 :
5006 : /*
5007 : * Calculate the first inode in the inode table block. Inode
5008 : * numbers are one-based. That is, the first inode in a block
5009 : * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
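 : 	 * For example (numbers illustrative): with inodes_per_block == 16
 : 	 * and orig_ino == 23, ((23 - 1) & ~15) + 1 == 17, so this block
 : 	 * covers inodes 17..32.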
5010 : */
5011 25 : ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
5012 25 : rcu_read_lock();
5013 450 : for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
5014 400 : if (ino == orig_ino)
5015 25 : continue;
5016 375 : __ext4_update_other_inode_time(sb, orig_ino, ino,
5017 : (struct ext4_inode *)buf);
5018 : }
5019 25 : rcu_read_unlock();
5020 25 : }
5021 :
5022 : /*
5023 : * Post the struct inode info into an on-disk inode location in the
5024 : * buffer-cache. This gobbles the caller's reference to the
5025 : * buffer_head in the inode location struct.
5026 : *
5027 : * The caller must have write access to iloc->bh.
5028 : */
5029 73226092 : static int ext4_do_update_inode(handle_t *handle,
5030 : struct inode *inode,
5031 : struct ext4_iloc *iloc)
5032 : {
5033 73226092 : struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5034 73226092 : struct ext4_inode_info *ei = EXT4_I(inode);
5035 73226092 : struct buffer_head *bh = iloc->bh;
5036 73226092 : struct super_block *sb = inode->i_sb;
5037 73226092 : int err;
5038 73226092 : int need_datasync = 0, set_large_file = 0;
5039 :
5040 73226092 : spin_lock(&ei->i_raw_lock);
5041 :
5042 : /*
5043 : * For fields not tracked in the in-memory inode, initialise them
5044 : * to zero for new inodes.
5045 : */
5046 73200861 : if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5047 5552690 : memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5048 :
5049 73200861 : if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
5050 2823561 : need_datasync = 1;
5051 73200861 : if (ei->i_disksize > 0x7fffffffULL) {
5052 2695289 : if (!ext4_has_feature_large_file(sb) ||
5053 2695286 : EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
5054 : set_large_file = 1;
5055 : }
5056 :
5057 73200861 : err = ext4_fill_raw_inode(inode, raw_inode);
5058 73286646 : spin_unlock(&ei->i_raw_lock);
5059 73357911 : if (err) {
5060 0 : EXT4_ERROR_INODE(inode, "corrupted inode contents");
5061 0 : goto out_brelse;
5062 : }
5063 :
5064 73357911 : if (inode->i_sb->s_flags & SB_LAZYTIME)
5065 25 : ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
5066 : bh->b_data);
5067 :
5068 73357911 : BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5069 73357911 : err = ext4_handle_dirty_metadata(handle, NULL, bh);
5070 73182863 : if (err)
5071 0 : goto out_error;
5072 73182863 : ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5073 73348367 : if (set_large_file) {
5074 0 : BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
5075 0 : err = ext4_journal_get_write_access(handle, sb,
5076 : EXT4_SB(sb)->s_sbh,
5077 : EXT4_JTR_NONE);
5078 0 : if (err)
5079 0 : goto out_error;
5080 0 : lock_buffer(EXT4_SB(sb)->s_sbh);
5081 0 : ext4_set_feature_large_file(sb);
5082 0 : ext4_superblock_csum_set(sb);
5083 0 : unlock_buffer(EXT4_SB(sb)->s_sbh);
5084 0 : ext4_handle_sync(handle);
5085 0 : err = ext4_handle_dirty_metadata(handle, NULL,
5086 : EXT4_SB(sb)->s_sbh);
5087 : }
5088 73348367 : ext4_update_inode_fsync_trans(handle, inode, need_datasync);
5089 73318843 : out_error:
5090 73318843 : ext4_std_error(inode->i_sb, err);
5091 73318843 : out_brelse:
5092 73318843 : brelse(bh);
5093 73323938 : return err;
5094 : }
5095 :
5096 : /*
5097 : * ext4_write_inode()
5098 : *
5099 : * We are called from a few places:
5100 : *
5101 : * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
5102 : * Here, there will be no transaction running. We wait for any running
5103 : * transaction to commit.
5104 : *
5105 : * - Within flush work (sys_sync(), kupdate and such).
5106 : * We wait on commit, if told to.
5107 : *
5108 : * - Within iput_final() -> write_inode_now()
5109 : * We wait on commit, if told to.
5110 : *
5111 : * In all cases it is actually safe for us to return without doing anything,
5112 : * because the inode has been copied into a raw inode buffer in
5113 : * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
5114 : * writeback.
5115 : *
5116 : * Note that we are absolutely dependent upon all inode dirtiers doing the
5117 : * right thing: they *must* call mark_inode_dirty() after dirtying info in
5118 : * which we are interested.
5119 : *
5120 : * It would be a bug for them to not do this. The code:
5121 : *
5122 : * mark_inode_dirty(inode)
5123 : * stuff();
5124 : * inode->i_size = expr;
5125 : *
5126 : * is in error because write_inode() could occur while `stuff()' is running,
5127 : * and the new i_size will be lost. Plus the inode will no longer be on the
5128 : * superblock's dirty inode list.
5129 : */
5130 818771 : int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5131 : {
5132 818771 : int err;
5133 :
5134 818771 : if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
5135 818771 : sb_rdonly(inode->i_sb))
5136 : return 0;
5137 :
5138 1637542 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5139 : return -EIO;
5140 :
5141 791932 : if (EXT4_SB(inode->i_sb)->s_journal) {
5142 691908 : if (ext4_journal_current_handle()) {
5143 0 : ext4_debug("called recursively, non-PF_MEMALLOC!\n");
5144 0 : dump_stack();
5145 0 : return -EIO;
5146 : }
5147 :
5148 : /*
5149 : * No need to force transaction in WB_SYNC_NONE mode. Also
5150 : * ext4_sync_fs() will force the commit after everything is
5151 : * written.
5152 : */
5153 691908 : if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
5154 : return 0;
5155 :
5156 2 : err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
5157 2 : EXT4_I(inode)->i_sync_tid);
5158 : } else {
5159 100024 : struct ext4_iloc iloc;
5160 :
5161 100024 : err = __ext4_get_inode_loc_noinmem(inode, &iloc);
5162 100024 : if (err)
5163 0 : return err;
5164 : /*
5165 : * sync(2) will flush the whole buffer cache. No need to do
5166 : * it here separately for each inode.
5167 : */
5168 100024 : if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
5169 21 : sync_dirty_buffer(iloc.bh);
5170 200251 : if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5171 0 : ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
5172 : "IO error syncing inode");
5173 0 : err = -EIO;
5174 : }
5175 100024 : brelse(iloc.bh);
5176 : }
5177 : return err;
5178 : }
5179 :
5180 : /*
5181 : * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
5182 : * buffers that are attached to a folio straddling i_size and are undergoing
5183 : * commit. In that case we have to wait for commit to finish and try again.
5184 : */
5185 1051 : static void ext4_wait_for_tail_page_commit(struct inode *inode)
5186 : {
5187 1051 : unsigned offset;
5188 1051 : journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
5189 1051 : tid_t commit_tid = 0;
5190 1051 : int ret;
5191 :
5192 1051 : offset = inode->i_size & (PAGE_SIZE - 1);
5193 : /*
5194 : * If the folio is fully truncated, we don't need to wait for any commit
5195 : * (and we even should not as __ext4_journalled_invalidate_folio() may
5196 : * strip all buffers from the folio but keep the folio dirty which can then
5197 : * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
5198 : * buffers). Also we don't need to wait for any commit if all buffers in
5199 : * the folio remain valid. This is most beneficial for the common case of
5200 : * blocksize == PAGESIZE.
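 : 	 * For example (illustrative): with 4k pages and 1k blocks, an
 : 	 * i_size offset of 3500 in the page keeps every buffer at least
 : 	 * partially within i_size (3500 > 4096 - 1024), so we return; an
 : 	 * offset of 2500 leaves the last 1k buffer entirely beyond i_size,
 : 	 * so it may be stripped and we must wait below.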
5201 : */
5202 1051 : if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
5203 1051 : return;
5204 0 : while (1) {
5205 0 : struct folio *folio = filemap_lock_folio(inode->i_mapping,
5206 0 : inode->i_size >> PAGE_SHIFT);
5207 0 : if (IS_ERR(folio))
5208 : return;
5209 0 : ret = __ext4_journalled_invalidate_folio(folio, offset,
5210 0 : folio_size(folio) - offset);
5211 0 : folio_unlock(folio);
5212 0 : folio_put(folio);
5213 0 : if (ret != -EBUSY)
5214 : return;
5215 0 : commit_tid = 0;
5216 0 : read_lock(&journal->j_state_lock);
5217 0 : if (journal->j_committing_transaction)
5218 0 : commit_tid = journal->j_committing_transaction->t_tid;
5219 0 : read_unlock(&journal->j_state_lock);
5220 0 : if (commit_tid)
5221 0 : jbd2_log_wait_commit(journal, commit_tid);
5222 : }
5223 : }
5224 :
5225 : /*
5226 : * ext4_setattr()
5227 : *
5228 : * Called from notify_change.
5229 : *
5230 : * We want to trap VFS attempts to truncate the file as soon as
5231 : * possible. In particular, we want to make sure that when the VFS
5232 : * shrinks i_size, we put the inode on the orphan list and modify
5233 : * i_disksize immediately, so that during the subsequent flushing of
5234 : * dirty pages and freeing of disk blocks, we can guarantee that any
5235 : * commit will leave the blocks being flushed in an unused state on
5236 : * disk. (On recovery, the inode will get truncated and the blocks will
5237 : * be freed, so we have a strong guarantee that no future commit will
5238 : * leave these blocks visible to the user.)
5239 : *
5240 : * Another thing we have to assure is that if we are in ordered mode
5241 : * and the inode is still attached to the committing transaction, we must
5242 : * start writeout of all the dirty pages which are being truncated.
5243 : * This way we are sure that all the data written in the previous
5244 : * transaction are already on disk (truncate waits for pages under
5245 : * writeback).
5246 : *
5247 : * Called with inode->i_rwsem down.
5248 : */
5249 2457367 : int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5250 : struct iattr *attr)
5251 : {
5252 2457367 : struct inode *inode = d_inode(dentry);
5253 2457367 : int error, rc = 0;
5254 2457367 : int orphan = 0;
5255 2457367 : const unsigned int ia_valid = attr->ia_valid;
5256 2457367 : bool inc_ivers = true;
5257 :
5258 4914734 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5259 : return -EIO;
5260 :
5261 2457336 : if (unlikely(IS_IMMUTABLE(inode)))
5262 : return -EPERM;
5263 :
5264 2457336 : if (unlikely(IS_APPEND(inode) &&
5265 : (ia_valid & (ATTR_MODE | ATTR_UID |
5266 : ATTR_GID | ATTR_TIMES_SET))))
5267 : return -EPERM;
5268 :
5269 2457336 : error = setattr_prepare(idmap, dentry, attr);
5270 2457298 : if (error)
5271 : return error;
5272 :
5273 2457196 : error = fscrypt_prepare_setattr(dentry, attr);
5274 2457196 : if (error)
5275 : return error;
5276 :
5277 2457196 : error = fsverity_prepare_setattr(dentry, attr);
5278 2457196 : if (error)
5279 : return error;
5280 :
5281 2457196 : if (is_quota_modification(idmap, inode, attr)) {
5282 1387770 : error = dquot_initialize(inode);
5283 1387770 : if (error)
5284 : return error;
5285 : }
5286 :
5287 3896719 : if (i_uid_needs_update(idmap, attr, inode) ||
5288 1439543 : i_gid_needs_update(idmap, attr, inode)) {
5289 1018370 : handle_t *handle;
5290 :
5291 : /* (user+group)*(old+new) structure, inode write (sb,
5292 : * inode block, ? - but truncate inode update has it) */
5293 1025091 : handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5294 : (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
5295 : EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
5296 1018368 : if (IS_ERR(handle)) {
5297 0 : error = PTR_ERR(handle);
5298 0 : goto err_out;
5299 : }
5300 :
5301 : /* dquot_transfer() calls back ext4_get_inode_usage() which
5302 : * counts xattr inode references.
5303 : */
5304 1018368 : down_read(&EXT4_I(inode)->xattr_sem);
5305 1018366 : error = dquot_transfer(idmap, inode, attr);
5306 1018363 : up_read(&EXT4_I(inode)->xattr_sem);
5307 :
5308 1018364 : if (error) {
5309 1 : ext4_journal_stop(handle);
5310 1 : return error;
5311 : }
5312 : /* Update corresponding info in inode so that everything is in
5313 : * one transaction */
5314 1018363 : i_uid_update(idmap, attr, inode);
5315 1018362 : i_gid_update(idmap, attr, inode);
5316 1018362 : error = ext4_mark_inode_dirty(handle, inode);
5317 1018370 : ext4_journal_stop(handle);
5318 1018354 : if (unlikely(error)) {
5319 : return error;
5320 : }
5321 : }
5322 :
5323 2457209 : if (attr->ia_valid & ATTR_SIZE) {
5324 369405 : handle_t *handle;
5325 369405 : loff_t oldsize = inode->i_size;
5326 369405 : loff_t old_disksize;
5327 369405 : int shrink = (attr->ia_size < inode->i_size);
5328 :
5329 369405 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5330 36 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5331 :
5332 36 : if (attr->ia_size > sbi->s_bitmap_maxbytes) {
5333 : return -EFBIG;
5334 : }
5335 : }
5336 369405 : if (!S_ISREG(inode->i_mode)) {
5337 : return -EINVAL;
5338 : }
5339 :
5340 369405 : if (attr->ia_size == inode->i_size)
5341 16463 : inc_ivers = false;
5342 :
5343 369405 : if (shrink) {
5344 202310 : if (ext4_should_order_data(inode)) {
5345 199606 : error = ext4_begin_ordered_truncate(inode,
5346 : attr->ia_size);
5347 199615 : if (error)
5348 0 : goto err_out;
5349 : }
5350 : /*
5351 : * Blocks are going to be removed from the inode. Wait
5352 : * for dio in flight.
5353 : */
5354 202312 : inode_dio_wait(inode);
5355 : }
5356 :
5357 369406 : filemap_invalidate_lock(inode->i_mapping);
5358 :
5359 369427 : rc = ext4_break_layouts(inode);
5360 369407 : if (rc) {
5361 0 : filemap_invalidate_unlock(inode->i_mapping);
5362 0 : goto err_out;
5363 : }
5364 :
5365 369407 : if (attr->ia_size != inode->i_size) {
5366 352946 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
5367 352964 : if (IS_ERR(handle)) {
5368 0 : error = PTR_ERR(handle);
5369 0 : goto out_mmap_sem;
5370 : }
5371 352964 : if (ext4_handle_valid(handle) && shrink) {
5372 202309 : error = ext4_orphan_add(handle, inode);
5373 202309 : orphan = 1;
5374 : }
5375 : /*
5376 : * Update c/mtime on truncate up, ext4_truncate() will
5377 : * update c/mtime in shrink case below
5378 : */
5379 352976 : if (!shrink) {
5380 150654 : inode->i_mtime = current_time(inode);
5381 150653 : inode->i_ctime = inode->i_mtime;
5382 : }
5383 :
5384 352975 : if (shrink)
5385 404644 : ext4_fc_track_range(handle, inode,
5386 202322 : (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
5387 202322 : inode->i_sb->s_blocksize_bits,
5388 : EXT_MAX_BLOCKS - 1);
5389 : else
5390 150653 : ext4_fc_track_range(
5391 : handle, inode,
5392 270984 : (oldsize > 0 ? oldsize - 1 : oldsize) >>
5393 150653 : inode->i_sb->s_blocksize_bits,
5394 150653 : (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
5395 150653 : inode->i_sb->s_blocksize_bits);
5396 :
5397 352962 : down_write(&EXT4_I(inode)->i_data_sem);
5398 352976 : old_disksize = EXT4_I(inode)->i_disksize;
5399 352976 : EXT4_I(inode)->i_disksize = attr->ia_size;
5400 352976 : rc = ext4_mark_inode_dirty(handle, inode);
5401 352979 : if (!error)
5402 352981 : error = rc;
5403 : /*
5404 : * We have to update i_size under i_data_sem together
5405 : * with i_disksize to avoid races with writeback code
5406 : * running ext4_wb_update_i_disksize().
5407 : */
5408 352979 : if (!error)
5409 352979 : i_size_write(inode, attr->ia_size);
5410 : else
5411 0 : EXT4_I(inode)->i_disksize = old_disksize;
5412 352979 : up_write(&EXT4_I(inode)->i_data_sem);
5413 352978 : ext4_journal_stop(handle);
5414 352978 : if (error)
5415 0 : goto out_mmap_sem;
5416 352978 : if (!shrink) {
5417 150658 : pagecache_isize_extended(inode, oldsize,
5418 : inode->i_size);
5419 202320 : } else if (ext4_should_journal_data(inode)) {
5420 1051 : ext4_wait_for_tail_page_commit(inode);
5421 : }
5422 : }
5423 :
5424 : /*
5425 : * Truncate pagecache after we've waited for commit
5426 : * in data=journal mode to make pages freeable.
5427 : */
5428 369429 : truncate_pagecache(inode, inode->i_size);
5429 : /*
5430 : * Call ext4_truncate() even if i_size didn't change to
5431 : * truncate possible preallocated blocks.
5432 : */
5433 369422 : if (attr->ia_size <= oldsize) {
5434 218776 : rc = ext4_truncate(inode);
5435 218803 : if (rc)
5436 0 : error = rc;
5437 : }
5438 369449 : out_mmap_sem:
5439 369449 : filemap_invalidate_unlock(inode->i_mapping);
5440 : }
5441 :
5442 2457254 : if (!error) {
5443 2457265 : if (inc_ivers)
5444 2440776 : inode_inc_iversion(inode);
5445 2457287 : setattr_copy(idmap, inode, attr);
5446 2457235 : mark_inode_dirty(inode);
5447 : }
5448 :
5449 : /*
5450 : * If the call to ext4_truncate failed to get a transaction handle at
5451 : * all, we need to clean up the in-core orphan list manually.
5452 : */
5453 2457314 : if (orphan && inode->i_nlink)
5454 202320 : ext4_orphan_del(NULL, inode);
5455 :
5456 2457316 : if (!error && (ia_valid & ATTR_MODE))
5457 40164 : rc = posix_acl_chmod(idmap, dentry, inode->i_mode);
5458 :
5459 2417152 : err_out:
5460 2457301 : if (error)
5461 0 : ext4_std_error(inode->i_sb, error);
5462 2457301 : if (!error)
5463 2457305 : error = rc;
5464 : return error;
5465 : }
5466 :
5467 2948138 : u32 ext4_dio_alignment(struct inode *inode)
5468 : {
5469 2948138 : if (fsverity_active(inode))
5470 : return 0;
5471 2948138 : if (ext4_should_journal_data(inode))
5472 : return 0;
5473 2947496 : if (ext4_has_inline_data(inode))
5474 : return 0;
5475 2947496 : if (IS_ENCRYPTED(inode)) {
5476 0 : if (!fscrypt_dio_supported(inode))
5477 : return 0;
5478 0 : return i_blocksize(inode);
5479 : }
5480 : return 1; /* use the iomap defaults */
5481 : }
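 : /*
 : * Callers interpret the result as: 0 -> direct I/O unsupported,
 : * 1 -> use the bdev/iomap defaults, anything else -> the required
 : * memory and offset alignment (see the STATX_DIOALIGN handling in
 : * ext4_getattr() below).
 : */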
5482 :
5483 15275088 : int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
5484 : struct kstat *stat, u32 request_mask, unsigned int query_flags)
5485 : {
5486 15275088 : struct inode *inode = d_inode(path->dentry);
5487 15275088 : struct ext4_inode *raw_inode;
5488 15275088 : struct ext4_inode_info *ei = EXT4_I(inode);
5489 15275088 : unsigned int flags;
5490 :
5491 15275088 : if ((request_mask & STATX_BTIME) &&
5492 274 : EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
5493 274 : stat->result_mask |= STATX_BTIME;
5494 274 : stat->btime.tv_sec = ei->i_crtime.tv_sec;
5495 274 : stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
5496 : }
5497 :
5498 : /*
5499 : * Return the DIO alignment restrictions if requested. We only return
5500 : * this information when requested, since on encrypted files it might
5501 : * take a fair bit of work to get if the file wasn't opened recently.
5502 : */
5503 15275088 : if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
5504 0 : u32 dio_align = ext4_dio_alignment(inode);
5505 :
5506 0 : stat->result_mask |= STATX_DIOALIGN;
5507 0 : if (dio_align == 1) {
5508 0 : struct block_device *bdev = inode->i_sb->s_bdev;
5509 :
5510 : /* iomap defaults */
5511 0 : stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
5512 0 : stat->dio_offset_align = bdev_logical_block_size(bdev);
5513 : } else {
5514 0 : stat->dio_mem_align = dio_align;
5515 0 : stat->dio_offset_align = dio_align;
5516 : }
5517 : }
5518 :
5519 15275088 : flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
5520 15275088 : if (flags & EXT4_APPEND_FL)
5521 185 : stat->attributes |= STATX_ATTR_APPEND;
5522 15275088 : if (flags & EXT4_COMPR_FL)
5523 139 : stat->attributes |= STATX_ATTR_COMPRESSED;
5524 15275088 : if (flags & EXT4_ENCRYPT_FL)
5525 0 : stat->attributes |= STATX_ATTR_ENCRYPTED;
5526 15275088 : if (flags & EXT4_IMMUTABLE_FL)
5527 193 : stat->attributes |= STATX_ATTR_IMMUTABLE;
5528 15275088 : if (flags & EXT4_NODUMP_FL)
5529 134 : stat->attributes |= STATX_ATTR_NODUMP;
5530 15275088 : if (flags & EXT4_VERITY_FL)
5531 0 : stat->attributes |= STATX_ATTR_VERITY;
5532 :
5533 15275088 : stat->attributes_mask |= (STATX_ATTR_APPEND |
5534 : STATX_ATTR_COMPRESSED |
5535 : STATX_ATTR_ENCRYPTED |
5536 : STATX_ATTR_IMMUTABLE |
5537 : STATX_ATTR_NODUMP |
5538 : STATX_ATTR_VERITY);
5539 :
5540 15275088 : generic_fillattr(idmap, inode, stat);
5541 15287693 : return 0;
5542 : }
5543 :
5544 9358132 : int ext4_file_getattr(struct mnt_idmap *idmap,
5545 : const struct path *path, struct kstat *stat,
5546 : u32 request_mask, unsigned int query_flags)
5547 : {
5548 9358132 : struct inode *inode = d_inode(path->dentry);
5549 9358132 : u64 delalloc_blocks;
5550 :
5551 9358132 : ext4_getattr(idmap, path, stat, request_mask, query_flags);
5552 :
5553 : /*
5554 : * If there is inline data in the inode, the inode will normally not
5555 : * have data blocks allocated (it may have an external xattr block).
5556 : * Report at least one sector for such files, so tools like tar, rsync,
5557 : 	 * and others don't incorrectly think the file is completely sparse.
5558 : */
5559 9364449 : if (unlikely(ext4_has_inline_data(inode)))
5560 0 : stat->blocks += (stat->size + 511) >> 9;
5561 :
5562 : /*
5563 : 	 * We can't update i_blocks if the block allocation is delayed;
5564 : 	 * otherwise, in the case of a system crash before the real block
5565 : 	 * allocation is done, we will have i_blocks inconsistent with
5566 : 	 * on-disk file blocks.
5567 : 	 * We always keep i_blocks updated together with real
5568 : 	 * allocation. But to avoid confusing userspace, stat
5569 : * will return the blocks that include the delayed allocation
5570 : * blocks for this file.
5571 : */
5572 9364449 : delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
5573 : EXT4_I(inode)->i_reserved_data_blocks);
5574 9364449 : stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
5575 9364449 : return 0;
5576 : }
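 : /*
 : * E.g. (illustrative): with 4k blocks, s_blocksize_bits == 12, so
 : * every delayed-allocation block adds 1 << (12 - 9) == 8 512-byte
 : * sectors to stat->blocks.
 : */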
5577 :
5578 8131208 : static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
5579 : int pextents)
5580 : {
5581 8131208 : if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5582 5125 : return ext4_ind_trans_blocks(inode, lblocks);
5583 8126083 : return ext4_ext_index_trans_blocks(inode, pextents);
5584 : }
5585 :
5586 : /*
5587 : * Account for index blocks, block group bitmaps and block group
5588 : * descriptor blocks if we modify data blocks and index blocks. In the
5589 : * worst case, the index blocks spread over different block groups.
5590 : *
5591 : * If data blocks are discontiguous, they may spread over different
5592 : * block groups too. If they are contiguous, with flexbg, they could
5593 : * still cross a block group boundary.
5594 : *
5595 : * Also account for superblock, inode, quota and xattr blocks
5596 : */
5597 8132062 : static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
5598 : int pextents)
5599 : {
5600 8132062 : ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5601 8131566 : int gdpblocks;
5602 8131566 : int idxblocks;
5603 8131566 : int ret;
5604 :
5605 : /*
5606 : 	 * How many index blocks do we need to touch to map @lblocks logical blocks
5607 : * to @pextents physical extents?
5608 : */
5609 8131566 : idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
5610 :
5611 8131737 : ret = idxblocks;
5612 :
5613 : /*
5614 : * Now let's see how many group bitmaps and group descriptors need
5615 : 	 * to be accounted for.
5616 : */
5617 8131737 : groups = idxblocks + pextents;
5618 8131737 : gdpblocks = groups;
5619 8131737 : if (groups > ngroups)
5620 : groups = ngroups;
5621 8131737 : if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5622 403801 : gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5623 :
5624 : /* bitmaps and block group descriptor blocks */
5625 8131737 : ret += groups + gdpblocks;
5626 :
5627 : /* Blocks for super block, inode, quota and xattr blocks */
5628 8131737 : ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5629 :
5630 8131737 : return ret;
5631 : }
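 : /*
 : * Worked example (numbers illustrative): lblocks == pextents == 4
 : * with idxblocks == 5 on a filesystem where both ngroups and
 : * s_gdb_count exceed 9 gives groups = gdpblocks = 5 + 4 = 9, so
 : * ret = 5 + 9 + 9 + EXT4_META_TRANS_BLOCKS(sb).
 : */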
5632 :
5633 : /*
5634 : * Calculate the total number of credits to reserve to fit
5635 : * the modification of a single page into a single transaction,
5636 : * which may include multiple chunks of block allocations.
5637 : *
5638 : * This could be called via ext4_write_begin()
5639 : *
5640 : * We need to consider the worst case, when
5641 : * one new block is allocated per extent.
5642 : */
5643 2839335 : int ext4_writepage_trans_blocks(struct inode *inode)
5644 : {
5645 2839335 : int bpp = ext4_journal_blocks_per_page(inode);
5646 2838863 : int ret;
5647 :
5648 2838863 : ret = ext4_meta_trans_blocks(inode, bpp, bpp);
5649 :
5650 : /* Account for data blocks for journalled mode */
5651 2838828 : if (ext4_should_journal_data(inode))
5652 194561 : ret += bpp;
5653 2838586 : return ret;
5654 : }
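 : /*
 : * E.g. (illustrative): with 4k blocks and 4k pages on a journaled
 : * filesystem, bpp == 1, so this reserves
 : * ext4_meta_trans_blocks(inode, 1, 1) credits, plus one more for
 : * the data block itself in data=journal mode.
 : */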
5655 :
5656 : /*
5657 : * Calculate the journal credits for a chunk of data modification.
5658 : *
5659 : * This is called from DIO, fallocate, or anything else that calls
5660 : * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5661 : *
5662 : * Journal buffers for data blocks are not included here, as DIO
5663 : * and fallocate do not need to journal data buffers.
5664 : */
5665 1357700 : int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5666 : {
5667 4131147 : return ext4_meta_trans_blocks(inode, nrblocks, 1);
5668 : }
5669 :
5670 : /*
5671 : * The caller must have previously called ext4_reserve_inode_write().
5672 : * Given this, we know that the caller already has write access to iloc->bh.
5673 : */
5674 73210571 : int ext4_mark_iloc_dirty(handle_t *handle,
5675 : struct inode *inode, struct ext4_iloc *iloc)
5676 : {
5677 73210571 : int err = 0;
5678 :
5679 146421142 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
5680 0 : put_bh(iloc->bh);
5681 0 : return -EIO;
5682 : }
5683 73210571 : ext4_fc_track_inode(handle, inode);
5684 :
5685 : /* the do_update_inode consumes one bh->b_count */
5686 73128752 : get_bh(iloc->bh);
5687 :
5688 : /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5689 73249704 : err = ext4_do_update_inode(handle, inode, iloc);
5690 73316075 : put_bh(iloc->bh);
5691 73316075 : return err;
5692 : }
5693 :
5694 : /*
5695 : * On success, we end up with an outstanding reference count against
5696 : * iloc->bh. This _must_ be cleaned up later.
5697 : */
5698 :
5699 : int
5700 74093787 : ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5701 : struct ext4_iloc *iloc)
5702 : {
5703 74093787 : int err;
5704 :
5705 148187574 : if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
5706 : return -EIO;
5707 :
5708 74093787 : err = ext4_get_inode_loc(inode, iloc);
5709 74085809 : if (!err) {
5710 74105491 : BUFFER_TRACE(iloc->bh, "get_write_access");
5711 74105491 : err = ext4_journal_get_write_access(handle, inode->i_sb,
5712 : iloc->bh, EXT4_JTR_NONE);
5713 74216549 : if (err) {
5714 0 : brelse(iloc->bh);
5715 0 : iloc->bh = NULL;
5716 : }
5717 : }
5718 74196867 : ext4_std_error(inode->i_sb, err);
5719 : return err;
5720 : }
5721 :
5722 20 : static int __ext4_expand_extra_isize(struct inode *inode,
5723 : unsigned int new_extra_isize,
5724 : struct ext4_iloc *iloc,
5725 : handle_t *handle, int *no_expand)
5726 : {
5727 20 : struct ext4_inode *raw_inode;
5728 20 : struct ext4_xattr_ibody_header *header;
5729 20 : unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
5730 20 : struct ext4_inode_info *ei = EXT4_I(inode);
5731 20 : int error;
5732 :
5733 : /* this was checked at iget time, but double check for good measure */
5734 20 : if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
5735 : (ei->i_extra_isize & 3)) {
5736 0 : EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
5737 : ei->i_extra_isize,
5738 : EXT4_INODE_SIZE(inode->i_sb));
5739 0 : return -EFSCORRUPTED;
5740 : }
5741 20 : if ((new_extra_isize < ei->i_extra_isize) ||
5742 20 : (new_extra_isize < 4) ||
5743 20 : (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
5744 : return -EINVAL; /* Should never happen */
5745 :
5746 20 : raw_inode = ext4_raw_inode(iloc);
5747 :
5748 20 : header = IHDR(inode, raw_inode);
5749 :
5750 : /* No extended attributes present */
5751 20 : if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5752 18 : header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5753 2 : memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
5754 : EXT4_I(inode)->i_extra_isize, 0,
5755 : new_extra_isize - EXT4_I(inode)->i_extra_isize);
5756 2 : EXT4_I(inode)->i_extra_isize = new_extra_isize;
5757 2 : return 0;
5758 : }
5759 :
5760 : /*
5761 : 	 * We may need to allocate an external xattr block, so we need quotas
5762 : 	 * initialized. Here we can be called with various locks held so we
5763 : 	 * cannot afford to initialize quotas ourselves. So just bail.
5764 : */
5765 18 : if (dquot_initialize_needed(inode))
5766 : return -EAGAIN;
5767 :
5768 : /* try to expand with EAs present */
5769 18 : error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
5770 : raw_inode, handle);
5771 18 : if (error) {
5772 : /*
5773 : * Inode size expansion failed; don't try again
5774 : */
5775 1 : *no_expand = 1;
5776 : }
5777 :
5778 : return error;
5779 : }
5780 :
5781 : /*
5782 : * Expand an inode by new_extra_isize bytes.
5783 : * Returns 0 on success or negative error number on failure.
5784 : */
5785 32 : static int ext4_try_to_expand_extra_isize(struct inode *inode,
5786 : unsigned int new_extra_isize,
5787 : struct ext4_iloc iloc,
5788 : handle_t *handle)
5789 : {
5790 32 : int no_expand;
5791 32 : int error;
5792 :
5793 32 : if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
5794 : return -EOVERFLOW;
5795 :
5796 : /*
5797 : * In nojournal mode, we can immediately attempt to expand
5798 : * the inode. When journaled, we first need to obtain extra
5799 : * buffer credits since we may write into the EA block
5800 : * with this same handle. If journal_extend fails, then it will
5801 : * only result in a minor loss of functionality for that inode.
5802 : * If this is felt to be critical, then e2fsck should be run to
5803 : * force a large enough s_min_extra_isize.
5804 : */
5805 20 : if (ext4_journal_extend(handle,
5806 40 : EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
5807 : return -ENOSPC;
5808 :
5809 20 : if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
5810 : return -EBUSY;
5811 :
5812 20 : error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
5813 : handle, &no_expand);
5814 20 : ext4_write_unlock_xattr(inode, &no_expand);
5815 :
5816 20 : return error;
5817 : }
5818 :
5819 0 : int ext4_expand_extra_isize(struct inode *inode,
5820 : unsigned int new_extra_isize,
5821 : struct ext4_iloc *iloc)
5822 : {
5823 0 : handle_t *handle;
5824 0 : int no_expand;
5825 0 : int error, rc;
5826 :
5827 0 : if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5828 0 : brelse(iloc->bh);
5829 0 : return -EOVERFLOW;
5830 : }
5831 :
5832 0 : handle = ext4_journal_start(inode, EXT4_HT_INODE,
5833 : EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
5834 0 : if (IS_ERR(handle)) {
5835 0 : error = PTR_ERR(handle);
5836 0 : brelse(iloc->bh);
5837 0 : return error;
5838 : }
5839 :
5840 0 : ext4_write_lock_xattr(inode, &no_expand);
5841 :
5842 0 : BUFFER_TRACE(iloc->bh, "get_write_access");
5843 0 : error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
5844 : EXT4_JTR_NONE);
5845 0 : if (error) {
5846 0 : brelse(iloc->bh);
5847 0 : goto out_unlock;
5848 : }
5849 :
5850 0 : error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
5851 : handle, &no_expand);
5852 :
5853 0 : rc = ext4_mark_iloc_dirty(handle, inode, iloc);
5854 0 : if (!error)
5855 0 : error = rc;
5856 :
5857 0 : out_unlock:
5858 0 : ext4_write_unlock_xattr(inode, &no_expand);
5859 0 : ext4_journal_stop(handle);
5860 0 : return error;
5861 : }
5862 :
5863 : /*
5864 : * What we do here is to mark the in-core inode as clean with respect to inode
5865 : * dirtiness (it may still be data-dirty).
5866 : * This means that the in-core inode may be reaped by prune_icache
5867 : * without having to perform any I/O. This is a very good thing,
5868 : * because *any* task may call prune_icache - even ones which
5869 : * have a transaction open against a different journal.
5870 : *
5871 : * Is this cheating? Not really. Sure, we haven't written the
5872 : * inode out, but prune_icache isn't a user-visible syncing function.
5873 : * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5874 : * we start and wait on commits.
5875 : */
5876 67369900 : int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
5877 : const char *func, unsigned int line)
5878 : {
5879 67369900 : struct ext4_iloc iloc;
5880 67369900 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5881 67369900 : int err;
5882 :
5883 67369900 : might_sleep();
5884 67334329 : trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5885 67285281 : err = ext4_reserve_inode_write(handle, inode, &iloc);
5886 67387508 : if (err)
5887 1 : goto out;
5888 :
5889 67387507 : if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
5890 32 : ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
5891 : iloc, handle);
5892 :
5893 67387507 : err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5894 67470621 : out:
5895 67470621 : if (unlikely(err))
5896 1 : ext4_error_inode_err(inode, func, line, 0, err,
5897 : "mark_inode_dirty error");
5898 67470621 : return err;
5899 : }
5900 :
5901 : /*
5902 : * ext4_dirty_inode() is called from __mark_inode_dirty()
5903 : *
5904 : * We're really interested in the case where a file is being extended.
5905 : * i_size has been changed by generic_commit_write() and we thus need
5906 : * to include the updated inode in the current transaction.
5907 : *
5908 : * Also, dquot_alloc_block() will always dirty the inode when blocks
5909 : * are allocated to the file.
5910 : *
5911 : * If the inode is marked synchronous, we don't honour that here - doing
5912 : * so would cause a commit on atime updates, which we don't bother doing.
5913 : * We handle synchronous inodes at the highest possible level.
5914 : */
5915 36015708 : void ext4_dirty_inode(struct inode *inode, int flags)
5916 : {
5917 36015708 : handle_t *handle;
5918 :
5919 36015708 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
5920 35981462 : if (IS_ERR(handle))
5921 : return;
5922 35981452 : ext4_mark_inode_dirty(handle, inode);
5923 36068076 : ext4_journal_stop(handle);
5924 : }
5925 :
5926 25 : int ext4_change_inode_journal_flag(struct inode *inode, int val)
5927 : {
5928 25 : journal_t *journal;
5929 25 : handle_t *handle;
5930 25 : int err;
5931 25 : int alloc_ctx;
5932 :
5933 : /*
5934 : * We have to be very careful here: changing a data block's
5935 : * journaling status dynamically is dangerous. If we write a
5936 : * data block to the journal, change the status and then delete
5937 : * that block, we risk forgetting to revoke the old log record
5938 : * from the journal and so a subsequent replay can corrupt data.
5939 : * So, first we make sure that the journal is empty and that
5940 : * nobody is changing anything.
5941 : */
5942 :
5943 25 : journal = EXT4_JOURNAL(inode);
5944 25 : if (!journal)
5945 : return 0;
5946 25 : if (is_journal_aborted(journal))
5947 : return -EROFS;
5948 :
5949 : /* Wait for all existing dio workers */
5950 25 : inode_dio_wait(inode);
5951 :
5952 : /*
5953 : 	 * Before flushing the journal and switching the inode's aops, we have
5954 : * to flush all dirty data the inode has. There can be outstanding
5955 : * delayed allocations, there can be unwritten extents created by
5956 : * fallocate or buffered writes in dioread_nolock mode covered by
5957 : * dirty data which can be converted only after flushing the dirty
5958 : * data (and journalled aops don't know how to handle these cases).
5959 : */
5960 25 : if (val) {
5961 13 : filemap_invalidate_lock(inode->i_mapping);
5962 13 : err = filemap_write_and_wait(inode->i_mapping);
5963 13 : if (err < 0) {
5964 0 : filemap_invalidate_unlock(inode->i_mapping);
5965 0 : return err;
5966 : }
5967 : }
5968 :
5969 25 : alloc_ctx = ext4_writepages_down_write(inode->i_sb);
5970 25 : jbd2_journal_lock_updates(journal);
5971 :
5972 : /*
5973 : * OK, there are no updates running now, and all cached data is
5974 : * synced to disk. We are now in a completely consistent state
5975 : * which doesn't have anything in the journal, and we know that
5976 : * no filesystem updates are running, so it is safe to modify
5977 : * the inode's in-core data-journaling state flag now.
5978 : */
5979 :
5980 25 : if (val)
5981 13 : ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5982 : else {
5983 12 : err = jbd2_journal_flush(journal, 0);
5984 12 : if (err < 0) {
5985 0 : jbd2_journal_unlock_updates(journal);
5986 0 : ext4_writepages_up_write(inode->i_sb, alloc_ctx);
5987 0 : return err;
5988 : }
5989 12 : ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5990 : }
5991 25 : ext4_set_aops(inode);
5992 :
5993 25 : jbd2_journal_unlock_updates(journal);
5994 25 : ext4_writepages_up_write(inode->i_sb, alloc_ctx);
5995 :
5996 25 : if (val)
5997 13 : filemap_invalidate_unlock(inode->i_mapping);
5998 :
5999 : /* Finally we can mark the inode as dirty. */
6000 :
6001 25 : handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
6002 25 : if (IS_ERR(handle))
6003 0 : return PTR_ERR(handle);
6004 :
6005 25 : ext4_fc_mark_ineligible(inode->i_sb,
6006 : EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
6007 25 : err = ext4_mark_inode_dirty(handle, inode);
6008 25 : ext4_handle_sync(handle);
6009 25 : ext4_journal_stop(handle);
6010 25 : ext4_std_error(inode->i_sb, err);
6011 :
6012 : return err;
6013 : }
6014 :
6015 143 : static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
6016 : struct buffer_head *bh)
6017 : {
6018 143 : return !buffer_mapped(bh);
6019 : }
6020 :
6021 8055512 : vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
6022 : {
6023 8055512 : struct vm_area_struct *vma = vmf->vma;
6024 8055512 : struct folio *folio = page_folio(vmf->page);
6025 8036587 : loff_t size;
6026 8036587 : unsigned long len;
6027 8036587 : int err;
6028 8036587 : vm_fault_t ret;
6029 8036587 : struct file *file = vma->vm_file;
6030 8036587 : struct inode *inode = file_inode(file);
6031 8036587 : struct address_space *mapping = inode->i_mapping;
6032 8036587 : handle_t *handle;
6033 8036587 : get_block_t *get_block;
6034 8036587 : int retries = 0;
6035 :
6036 8036587 : if (unlikely(IS_IMMUTABLE(inode)))
6037 : return VM_FAULT_SIGBUS;
6038 :
6039 8036587 : sb_start_pagefault(inode->i_sb);
6040 8023098 : file_update_time(vma->vm_file);
6041 :
6042 8030965 : filemap_invalidate_lock_shared(mapping);
6043 :
6044 8038987 : err = ext4_convert_inline_data(inode);
6045 7992341 : if (err)
6046 0 : goto out_ret;
6047 :
6048 : /*
6049 : * On data journalling we skip straight to the transaction handle:
6050 : 	 * there's no delalloc; page truncation will be checked later; the
6051 : * early return w/ all buffers mapped (calculates size/len) can't
6052 : * be used; and there's no dioread_nolock, so only ext4_get_block.
6053 : */
6054 8000677 : if (ext4_should_journal_data(inode))
6055 0 : goto retry_alloc;
6056 :
6057 : /* Delalloc case is easy... */
6058 16033591 : if (test_opt(inode->i_sb, DELALLOC) &&
6059 8018841 : !ext4_nonda_switch(inode->i_sb)) {
6060 8021300 : do {
6061 8021300 : err = block_page_mkwrite(vma, vmf,
6062 : ext4_da_get_block_prep);
6063 8018835 : } while (err == -ENOSPC &&
6064 3181 : ext4_should_retry_alloc(inode->i_sb, &retries));
6065 8014032 : goto out_ret;
6066 : }
6067 :
6068 208 : folio_lock(folio);
6069 1630 : size = i_size_read(inode);
6070 : /* Page got truncated from under us? */
6071 1630 : if (folio->mapping != mapping || folio_pos(folio) > size) {
6072 0 : folio_unlock(folio);
6073 0 : ret = VM_FAULT_NOPAGE;
6074 0 : goto out;
6075 : }
6076 :
6077 1630 : len = folio_size(folio);
6078 1630 : if (folio_pos(folio) + len > size)
6079 1 : len = size - folio_pos(folio);
6080 : /*
6081 : * Return if we have all the buffers mapped. This avoids the need to do
6082 : 	 * journal_start/journal_stop, which can block and take a long time.
6083 : *
6084 : * This cannot be done for data journalling, as we have to add the
6085 : * inode to the transaction's list to writeprotect pages on commit.
6086 : */
6087 1630 : if (folio_buffers(folio)) {
6088 143 : if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
6089 : 0, len, NULL,
6090 : ext4_bh_unmapped)) {
6091 : /* Wait so that we don't change page under IO */
6092 139 : folio_wait_stable(folio);
6093 139 : ret = VM_FAULT_LOCKED;
6094 139 : goto out;
6095 : }
6096 : }
6097 1491 : folio_unlock(folio);
6098 : /* OK, we need to fill the hole... */
6099 1492 : if (ext4_should_dioread_nolock(inode))
6100 : get_block = ext4_get_block_unwritten;
6101 : else
6102 193 : get_block = ext4_get_block;
6103 1491 : retry_alloc:
6104 1666 : handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
6105 : ext4_writepage_trans_blocks(inode));
6106 1664 : if (IS_ERR(handle)) {
6107 0 : ret = VM_FAULT_SIGBUS;
6108 0 : goto out;
6109 : }
6110 : /*
6111 : * Data journalling can't use block_page_mkwrite() because it
6112 : * will set_buffer_dirty() before do_journal_get_write_access()
6113 : 	 * and thus might hit warning messages for dirty metadata buffers.
6114 : */
6115 1664 : if (!ext4_should_journal_data(inode)) {
6116 1664 : err = block_page_mkwrite(vma, vmf, get_block);
6117 : } else {
6118 0 : folio_lock(folio);
6119 0 : size = i_size_read(inode);
6120 : /* Page got truncated from under us? */
6121 0 : if (folio->mapping != mapping || folio_pos(folio) > size) {
6122 0 : ret = VM_FAULT_NOPAGE;
6123 0 : goto out_error;
6124 : }
6125 :
6126 0 : len = folio_size(folio);
6127 0 : if (folio_pos(folio) + len > size)
6128 0 : len = size - folio_pos(folio);
6129 :
6130 0 : err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
6131 0 : if (!err) {
6132 0 : ret = VM_FAULT_SIGBUS;
6133 0 : if (ext4_journal_folio_buffers(handle, folio, len))
6134 0 : goto out_error;
6135 : } else {
6136 0 : folio_unlock(folio);
6137 : }
6138 : }
6139 1667 : ext4_journal_stop(handle);
6140 1667 : if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
6141 175 : goto retry_alloc;
6142 1492 : out_ret:
6143 8007188 : ret = block_page_mkwrite_return(err);
6144 8007327 : out:
6145 8007327 : filemap_invalidate_unlock_shared(mapping);
6146 8037147 : sb_end_pagefault(inode->i_sb);
6147 8037147 : return ret;
6148 0 : out_error:
6149 0 : folio_unlock(folio);
6150 0 : ext4_journal_stop(handle);
6151 0 : goto out;
6152 : }