Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2010 Red Hat, Inc.
4 : * Copyright (C) 2016-2019 Christoph Hellwig.
5 : */
6 : #include <linux/module.h>
7 : #include <linux/compiler.h>
8 : #include <linux/fs.h>
9 : #include <linux/iomap.h>
10 : #include <linux/pagemap.h>
11 : #include <linux/uio.h>
12 : #include <linux/buffer_head.h>
13 : #include <linux/dax.h>
14 : #include <linux/writeback.h>
15 : #include <linux/list_sort.h>
16 : #include <linux/swap.h>
17 : #include <linux/bio.h>
18 : #include <linux/sched/signal.h>
19 : #include <linux/migrate.h>
20 : #include "trace.h"
21 :
22 : #include "../internal.h"
23 :
24 : #define IOEND_BATCH_SIZE 4096
25 :
26 : /*
27 : * Structure allocated for each folio when block size < folio size
28 : * to track sub-folio uptodate status and I/O completions.
29 : */
30 : struct iomap_page {
31 : atomic_t read_bytes_pending;
32 : atomic_t write_bytes_pending;
33 : spinlock_t uptodate_lock;
34 : unsigned long uptodate[];
35 : };
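/*
 * Illustrative sketch (standalone, not kernel code): how a per-folio uptodate
 * bitmap like the one above is sized and indexed. With a 64kB folio and 4kB
 * filesystem blocks there are 16 blocks to track, so BITS_TO_LONGS(16) longs
 * back iop->uptodate, and the block covering byte offset "off" within the
 * folio is bit (off >> blkbits). All names below are local to the sketch.
 */
#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned int folio_size = 64 * 1024;	/* 64kB folio */
	unsigned int blkbits = 12;		/* 4kB blocks */
	unsigned int nr_blocks = folio_size >> blkbits;
	size_t off = 20 * 1024;			/* byte offset into the folio */

	printf("blocks per folio: %u\n", nr_blocks);		      /* 16 */
	printf("bitmap longs:     %zu\n", BITS_TO_LONGS(nr_blocks));  /* 1 on 64-bit */
	printf("bit for offset:   %zu\n", off >> blkbits);	      /* block 5 */
	return 0;
}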
36 :
37 : static inline struct iomap_page *to_iomap_page(struct folio *folio)
38 : {
39 9994565099 : if (folio_test_private(folio))
40 280229088 : return folio_get_private(folio);
41 : return NULL;
42 : }
43 :
44 : static struct bio_set iomap_ioend_bioset;
45 :
46 : static struct iomap_page *
47 3994413451 : iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
48 : {
49 3994413451 : struct iomap_page *iop = to_iomap_page(folio);
50 3994413451 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
51 3993380947 : gfp_t gfp;
52 :
53 3993380947 : if (iop || nr_blocks <= 1)
54 : return iop;
55 :
56 136832825 : if (flags & IOMAP_NOWAIT)
57 : gfp = GFP_NOWAIT;
58 : else
59 136833413 : gfp = GFP_NOFS | __GFP_NOFAIL;
60 :
61 136832825 : iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
62 : gfp);
63 136835436 : if (iop) {
64 136835436 : spin_lock_init(&iop->uptodate_lock);
65 136837568 : if (folio_test_uptodate(folio))
66 35 : bitmap_fill(iop->uptodate, nr_blocks);
67 136837533 : folio_attach_private(folio, iop);
68 : }
69 : return iop;
70 : }
71 :
72 136848882 : static void iomap_page_release(struct folio *folio)
73 : {
74 136848882 : struct iomap_page *iop = folio_detach_private(folio);
75 136850552 : struct inode *inode = folio->mapping->host;
76 136850552 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
77 :
78 136848708 : if (!iop)
79 : return;
80 136848708 : WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
81 136848708 : WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
82 273696206 : WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
83 : folio_test_uptodate(folio));
84 136847498 : kfree(iop);
85 : }
86 :
87 : /*
88 : * Calculate the range inside the folio that we actually need to read.
89 : */
90 3314776898 : static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
91 : loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
92 : {
93 3314776898 : struct iomap_page *iop = to_iomap_page(folio);
94 3314776898 : loff_t orig_pos = *pos;
95 3314776898 : loff_t isize = i_size_read(inode);
96 3314776898 : unsigned block_bits = inode->i_blkbits;
97 3314776898 : unsigned block_size = (1 << block_bits);
98 3314776898 : size_t poff = offset_in_folio(folio, *pos);
99 3313756944 : size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
100 3313877545 : unsigned first = poff >> block_bits;
101 3313877545 : unsigned last = (poff + plen - 1) >> block_bits;
102 :
103 : /*
104 : * If the block size is smaller than the folio size, we need to check the
105 : * per-block uptodate status and adjust the offset and length if needed
106 : * to avoid reading in already uptodate ranges.
107 : */
108 3313877545 : if (iop) {
109 : unsigned int i;
110 :
111 : /* move forward for each leading block marked uptodate */
112 180500581 : for (i = first; i <= last; i++) {
113 333880082 : if (!test_bit(i, iop->uptodate))
114 : break;
115 14347066 : *pos += block_size;
116 14347066 : poff += block_size;
117 14347066 : plen -= block_size;
118 14347066 : first++;
119 : }
120 :
121 : /* truncate len if we find any trailing uptodate block(s) */
122 860079362 : for ( ; i <= last; i++) {
123 1387048607 : if (test_bit(i, iop->uptodate)) {
124 9 : plen -= (last - i + 1) * block_size;
125 9 : last = i - 1;
126 9 : break;
127 : }
128 : }
129 : }
130 :
131 : /*
132 : * If the extent spans the block that contains the i_size, we need to
133 : * handle both halves separately so that we properly zero data in the
134 : * page cache for blocks that are entirely outside of i_size.
135 : */
136 3314282234 : if (orig_pos <= isize && orig_pos + length > isize) {
137 550307021 : unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
138 :
139 550488483 : if (first <= end && last > end)
140 5710760 : plen -= (last - end) * block_size;
141 : }
142 :
143 3314463696 : *offp = poff;
144 3314463696 : *lenp = plen;
145 3314463696 : }
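/*
 * Worked example (standalone sketch, not kernel code) of the i_size trim at
 * the end of iomap_adjust_read_range() above: for a 16kB folio at file offset
 * 0 with 4kB blocks and i_size = 9000, the block holding EOF is block 2, so
 * block 3 is dropped from the read and will simply be zeroed in the cache.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;			/* 4kB blocks */
	long long isize = 9000;				/* i_size */
	size_t poff = 0, plen = 16384;			/* whole 16kB folio */
	unsigned int first = poff >> blkbits;		/* block 0 */
	unsigned int last = (poff + plen - 1) >> blkbits; /* block 3 */
	unsigned int end = (isize - 1) >> blkbits;	/* block 2 holds EOF */

	if (first <= end && last > end)
		plen -= (size_t)(last - end) << blkbits;

	printf("read %zu bytes, blocks %u..%u\n", plen, first, end); /* 12288, 0..2 */
	return 0;
}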
146 :
147 196180807 : static void iomap_iop_set_range_uptodate(struct folio *folio,
148 : struct iomap_page *iop, size_t off, size_t len)
149 : {
150 196180807 : struct inode *inode = folio->mapping->host;
151 196180807 : unsigned first = off >> inode->i_blkbits;
152 196180807 : unsigned last = (off + len - 1) >> inode->i_blkbits;
153 196180807 : unsigned long flags;
154 :
155 196180807 : spin_lock_irqsave(&iop->uptodate_lock, flags);
156 196185502 : bitmap_set(iop->uptodate, first, last - first + 1);
157 196182276 : if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
158 166987162 : folio_mark_uptodate(folio);
159 196186678 : spin_unlock_irqrestore(&iop->uptodate_lock, flags);
160 196178612 : }
161 :
162 4574359909 : static void iomap_set_range_uptodate(struct folio *folio,
163 : struct iomap_page *iop, size_t off, size_t len)
164 : {
165 4574359909 : if (iop)
166 196181318 : iomap_iop_set_range_uptodate(folio, iop, off, len);
167 : else
168 4378178591 : folio_mark_uptodate(folio);
169 4576901501 : }
170 :
171 89502171 : static void iomap_finish_folio_read(struct folio *folio, size_t offset,
172 : size_t len, int error)
173 : {
174 89502171 : struct iomap_page *iop = to_iomap_page(folio);
175 :
176 89502171 : if (unlikely(error)) {
177 7272 : folio_clear_uptodate(folio);
178 7272 : folio_set_error(folio);
179 : } else {
180 89494899 : iomap_set_range_uptodate(folio, iop, offset, len);
181 : }
182 :
183 89502171 : if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
184 86085000 : folio_unlock(folio);
185 89502171 : }
186 :
187 35121279 : static void iomap_read_end_io(struct bio *bio)
188 : {
189 35121279 : int error = blk_status_to_errno(bio->bi_status);
190 35121279 : struct folio_iter fi;
191 :
192 124623450 : bio_for_each_folio_all(fi, bio)
193 89502171 : iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
194 35121279 : bio_put(bio);
195 35121279 : }
196 :
197 : struct iomap_readpage_ctx {
198 : struct folio *cur_folio;
199 : bool cur_folio_in_bio;
200 : struct bio *bio;
201 : struct readahead_control *rac;
202 : };
203 :
204 : /**
205 : * iomap_read_inline_data - copy inline data into the page cache
206 : * @iter: iteration structure
207 : * @folio: folio to copy to
208 : *
209 : * Copy the inline data in @iter into @folio and zero out the rest of the folio.
210 : * Only a single IOMAP_INLINE extent is allowed at the end of each file.
211 : * Returns zero for success to complete the read, or the usual negative errno.
212 : */
213 0 : static int iomap_read_inline_data(const struct iomap_iter *iter,
214 : struct folio *folio)
215 : {
216 0 : struct iomap_page *iop;
217 0 : const struct iomap *iomap = iomap_iter_srcmap(iter);
218 0 : size_t size = i_size_read(iter->inode) - iomap->offset;
219 0 : size_t poff = offset_in_page(iomap->offset);
220 0 : size_t offset = offset_in_folio(folio, iomap->offset);
221 0 : void *addr;
222 :
223 0 : if (folio_test_uptodate(folio))
224 : return 0;
225 :
226 0 : if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
227 : return -EIO;
228 0 : if (WARN_ON_ONCE(size > PAGE_SIZE -
229 : offset_in_page(iomap->inline_data)))
230 : return -EIO;
231 0 : if (WARN_ON_ONCE(size > iomap->length))
232 : return -EIO;
233 0 : if (offset > 0)
234 0 : iop = iomap_page_create(iter->inode, folio, iter->flags);
235 : else
236 0 : iop = to_iomap_page(folio);
237 :
238 0 : addr = kmap_local_folio(folio, offset);
239 0 : memcpy(addr, iomap->inline_data, size);
240 0 : memset(addr + size, 0, PAGE_SIZE - poff - size);
241 0 : kunmap_local(addr);
242 0 : iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
243 0 : return 0;
244 : }
245 :
246 2658814289 : static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
247 : loff_t pos)
248 : {
249 2658814289 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
250 :
251 2756515834 : return srcmap->type != IOMAP_MAPPED ||
252 2658814289 : (srcmap->flags & IOMAP_F_NEW) ||
253 97699869 : pos >= i_size_read(iter->inode);
254 : }
255 :
256 2579896786 : static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
257 : struct iomap_readpage_ctx *ctx, loff_t offset)
258 : {
259 2579896786 : const struct iomap *iomap = &iter->iomap;
260 2579896786 : loff_t pos = iter->pos + offset;
261 2579896786 : loff_t length = iomap_length(iter) - offset;
262 2579896786 : struct folio *folio = ctx->cur_folio;
263 2579896786 : struct iomap_page *iop;
264 2579896786 : loff_t orig_pos = pos;
265 2579896786 : size_t poff, plen;
266 2579896786 : sector_t sector;
267 :
268 2579896786 : if (iomap->type == IOMAP_INLINE)
269 0 : return iomap_read_inline_data(iter, folio);
270 :
271 : /* zero post-eof blocks as the page may be mapped */
272 2579896786 : iop = iomap_page_create(iter->inode, folio, iter->flags);
273 2579643536 : iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
274 2579725503 : if (plen == 0)
275 157 : goto done;
276 :
277 2579725346 : if (iomap_block_needs_zeroing(iter, pos)) {
278 2489508711 : folio_zero_range(folio, poff, plen);
279 2490006116 : iomap_set_range_uptodate(folio, iop, poff, plen);
280 2490945580 : goto done;
281 : }
282 :
283 90216635 : ctx->cur_folio_in_bio = true;
284 90216635 : if (iop)
285 15057388 : atomic_add(plen, &iop->read_bytes_pending);
286 :
287 90216650 : sector = iomap_sector(iomap, pos);
288 90216650 : if (!ctx->bio ||
289 120336277 : bio_end_sector(ctx->bio) != sector ||
290 55227953 : !bio_add_folio(ctx->bio, folio, plen, poff)) {
291 35107396 : gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
292 35107396 : gfp_t orig_gfp = gfp;
293 35107396 : unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
294 :
295 35107396 : if (ctx->bio)
296 9999689 : submit_bio(ctx->bio);
297 :
298 35108438 : if (ctx->rac) /* same as readahead_gfp_mask */
299 17071434 : gfp |= __GFP_NORETRY | __GFP_NOWARN;
300 35108438 : ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
301 : REQ_OP_READ, gfp);
302 : /*
303 : * If the bio_alloc fails, try it again for a single page to
304 : * avoid having to deal with partial page reads. This emulates
305 : * what do_mpage_read_folio does.
306 : */
307 35116326 : if (!ctx->bio) {
308 0 : ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
309 : orig_gfp);
310 : }
311 35116326 : if (ctx->rac)
312 17071754 : ctx->bio->bi_opf |= REQ_RAHEAD;
313 35116326 : ctx->bio->bi_iter.bi_sector = sector;
314 35116326 : ctx->bio->bi_end_io = iomap_read_end_io;
315 35116326 : bio_add_folio_nofail(ctx->bio, folio, plen, poff);
316 : }
317 :
318 55109162 : done:
319 : /*
320 : * Move the caller beyond our range so that it keeps making progress.
321 : * For that, we have to include any leading non-uptodate ranges, but
322 : * we can skip trailing ones as they will be handled in the next
323 : * iteration.
324 : */
325 2581168094 : return pos - orig_pos + plen;
326 : }
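/*
 * Worked example (standalone sketch) of the return value convention above: if
 * the request started at orig_pos = 0, the first 4kB block had already been
 * skipped by the range adjustment (so pos advanced to 4096) and plen ended up
 * covering the next 8kB, the iteration returns 4096 + 8192 = 12288 so the
 * caller moves past both the skipped block and the range just handled.
 */
#include <stdio.h>

int main(void)
{
	long long orig_pos = 0;
	long long pos = 4096;		/* advanced past a leading skipped block */
	long long plen = 8192;		/* range zeroed or queued for read I/O */

	printf("advance caller by %lld bytes\n", pos - orig_pos + plen); /* 12288 */
	return 0;
}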
327 :
328 1649225438 : int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
329 : {
330 3297400695 : struct iomap_iter iter = {
331 1649225438 : .inode = folio->mapping->host,
332 : .pos = folio_pos(folio),
333 1649225438 : .len = folio_size(folio),
334 : };
335 1648175257 : struct iomap_readpage_ctx ctx = {
336 : .cur_folio = folio,
337 : };
338 1648175257 : int ret;
339 :
340 1648175257 : trace_iomap_readpage(iter.inode, 1);
341 :
342 3297112997 : while ((ret = iomap_iter(&iter, ops)) > 0)
343 1647404921 : iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
344 :
345 1648548850 : if (ret < 0)
346 289 : folio_set_error(folio);
347 :
348 1648548850 : if (ctx.bio) {
349 18038559 : submit_bio(ctx.bio);
350 18049254 : WARN_ON_ONCE(!ctx.cur_folio_in_bio);
351 : } else {
352 1630510291 : WARN_ON_ONCE(ctx.cur_folio_in_bio);
353 1630510291 : folio_unlock(folio);
354 : }
355 :
356 : /*
357 : * Just like mpage_readahead and block_read_full_folio, we always
358 : * return 0 and just set the folio error flag on errors. This
359 : * should be cleaned up throughout the stack eventually.
360 : */
361 1649243550 : return 0;
362 : }
363 : EXPORT_SYMBOL_GPL(iomap_read_folio);
364 :
365 67285491 : static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
366 : struct iomap_readpage_ctx *ctx)
367 : {
368 67285491 : loff_t length = iomap_length(iter);
369 67285491 : loff_t done, ret;
370 :
371 999670272 : for (done = 0; done < length; done += ret) {
372 932384177 : if (ctx->cur_folio &&
373 893629694 : offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
374 878265913 : if (!ctx->cur_folio_in_bio)
375 814820338 : folio_unlock(ctx->cur_folio);
376 878291317 : ctx->cur_folio = NULL;
377 : }
378 932381518 : if (!ctx->cur_folio) {
379 917032016 : ctx->cur_folio = readahead_folio(ctx->rac);
380 917046837 : ctx->cur_folio_in_bio = false;
381 : }
382 932396339 : ret = iomap_readpage_iter(iter, ctx, done);
383 932384781 : if (ret <= 0)
384 0 : return ret;
385 : }
386 :
387 : return done;
388 : }
389 :
390 : /**
391 : * iomap_readahead - Attempt to read pages from a file.
392 : * @rac: Describes the pages to be read.
393 : * @ops: The operations vector for the filesystem.
394 : *
395 : * This function is for filesystems to call to implement their readahead
396 : * address_space operation.
397 : *
398 : * Context: The @ops callbacks may submit I/O (eg to read the addresses of
399 : * blocks from disc), and may wait for it. The caller may be trying to
400 : * access a different page, and so sleeping excessively should be avoided.
401 : * It may allocate memory, but should avoid costly allocations. This
402 : * function is called with memalloc_nofs set, so allocations will not cause
403 : * the filesystem to be reentered.
404 : */
405 38755150 : void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
406 : {
407 38755150 : struct iomap_iter iter = {
408 38755150 : .inode = rac->mapping->host,
409 : .pos = readahead_pos(rac),
410 : .len = readahead_length(rac),
411 : };
412 38755150 : struct iomap_readpage_ctx ctx = {
413 : .rac = rac,
414 : };
415 :
416 38755150 : trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
417 :
418 106041235 : while (iomap_iter(&iter, ops) > 0)
419 67285555 : iter.processed = iomap_readahead_iter(&iter, &ctx);
420 :
421 38755074 : if (ctx.bio)
422 7071980 : submit_bio(ctx.bio);
423 38754976 : if (ctx.cur_folio) {
424 38754857 : if (!ctx.cur_folio_in_bio)
425 34167501 : folio_unlock(ctx.cur_folio);
426 : }
427 38755082 : }
428 : EXPORT_SYMBOL_GPL(iomap_readahead);
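/*
 * Hedged usage sketch: a filesystem would typically point its address_space
 * read paths at the helpers above. The myfs_* names and myfs_read_iomap_ops
 * are hypothetical placeholders; only the iomap_* helpers come from this file.
 */
extern const struct iomap_ops myfs_read_iomap_ops;

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &myfs_read_iomap_ops);
}

static void myfs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &myfs_read_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	.read_folio		= myfs_read_folio,
	.readahead		= myfs_readahead,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
};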
429 :
430 : /*
431 : * iomap_is_partially_uptodate checks whether blocks within a folio are
432 : * uptodate or not.
433 : *
434 : * Returns true if all blocks which correspond to the specified part
435 : * of the folio are uptodate.
436 : */
437 27119 : bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
438 : {
439 27119 : struct iomap_page *iop = to_iomap_page(folio);
440 27119 : struct inode *inode = folio->mapping->host;
441 27119 : unsigned first, last, i;
442 :
443 27119 : if (!iop)
444 : return false;
445 :
446 : /* Caller's range may extend past the end of this folio */
447 43 : count = min(folio_size(folio) - from, count);
448 :
449 : /* First and last blocks in range within folio */
450 43 : first = from >> inode->i_blkbits;
451 43 : last = (from + count - 1) >> inode->i_blkbits;
452 :
453 152 : for (i = first; i <= last; i++)
454 296 : if (!test_bit(i, iop->uptodate))
455 : return false;
456 : return true;
457 : }
458 : EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
459 :
460 : /**
461 : * iomap_get_folio - get a folio reference for writing
462 : * @iter: iteration structure
463 : * @pos: start offset of write
464 : *
465 : * Returns a locked reference to the folio at @pos, or an error pointer if the
466 : * folio could not be obtained.
467 : */
468 1914438836 : struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
469 : {
470 1914438836 : unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
471 :
472 1914438836 : if (iter->flags & IOMAP_NOWAIT)
473 0 : fgp |= FGP_NOWAIT;
474 :
475 3829610723 : return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
476 1914438836 : fgp, mapping_gfp_mask(iter->inode->i_mapping));
477 : }
478 : EXPORT_SYMBOL_GPL(iomap_get_folio);
479 :
480 19180485 : bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
481 : {
482 19180482 : trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
483 19180485 : folio_size(folio));
484 :
485 : /*
486 : * mm accommodates an old ext3 case where clean folios might
487 : * not have had the dirty bit cleared. Thus, it can send actual
488 : * dirty folios to ->release_folio() via shrink_active_list();
489 : * skip those here.
490 : */
491 19180473 : if (folio_test_dirty(folio) || folio_test_writeback(folio))
492 : return false;
493 19180469 : iomap_page_release(folio);
494 19180469 : return true;
495 : }
496 : EXPORT_SYMBOL_GPL(iomap_release_folio);
497 :
498 117669434 : void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
499 : {
500 117669434 : trace_iomap_invalidate_folio(folio->mapping->host,
501 117669434 : folio_pos(folio) + offset, len);
502 :
503 : /*
504 : * If we're invalidating the entire folio, clear the dirty state
505 : * from it and release it to avoid unnecessary buildup of the LRU.
506 : */
507 117669292 : if (offset == 0 && len == folio_size(folio)) {
508 109118143 : WARN_ON_ONCE(folio_test_writeback(folio));
509 109118143 : folio_cancel_dirty(folio);
510 109118493 : iomap_page_release(folio);
511 8550935 : } else if (folio_test_large(folio)) {
512 : /* Must release the iop so the page can be split */
513 17101727 : WARN_ON_ONCE(!folio_test_uptodate(folio) &&
514 : folio_test_dirty(folio));
515 8550863 : iomap_page_release(folio);
516 : }
517 117669974 : }
518 : EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
519 :
520 : static void
521 119015 : iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
522 : {
523 119015 : loff_t i_size = i_size_read(inode);
524 :
525 : /*
526 : * Only truncate newly allocated pages beyond EOF, even if the
527 : * write started inside the existing inode size.
528 : */
529 119015 : if (pos + len > i_size)
530 56323 : truncate_pagecache_range(inode, max(pos, i_size),
531 : pos + len - 1);
532 119015 : }
533 :
534 7467396 : static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
535 : size_t poff, size_t plen, const struct iomap *iomap)
536 : {
537 7467396 : struct bio_vec bvec;
538 7467396 : struct bio bio;
539 :
540 7467396 : bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
541 7467411 : bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
542 7467411 : bio_add_folio_nofail(&bio, folio, plen, poff);
543 7467362 : return submit_bio_wait(&bio);
544 : }
545 :
546 1914894082 : static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
547 : size_t len, struct folio *folio)
548 : {
549 1914894082 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
550 1914894082 : struct iomap_page *iop;
551 1914894082 : loff_t block_size = i_blocksize(iter->inode);
552 1915093644 : loff_t block_start = round_down(pos, block_size);
553 1915093644 : loff_t block_end = round_up(pos + len, block_size);
554 1915093644 : unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
555 1915217872 : size_t from = offset_in_folio(folio, pos), to = from + len;
556 1915333123 : size_t poff, plen;
557 :
558 3096486112 : if (folio_test_uptodate(folio))
559 : return 0;
560 734202058 : folio_clear_error(folio);
561 :
562 735092700 : iop = iomap_page_create(iter->inode, folio, iter->flags);
563 734332641 : if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
564 : return -EAGAIN;
565 :
566 734346119 : do {
567 734346119 : iomap_adjust_read_range(iter->inode, folio, &block_start,
568 : block_end - block_start, &poff, &plen);
569 734892409 : if (plen == 0)
570 : break;
571 :
572 721132044 : if (!(iter->flags & IOMAP_UNSHARE) &&
573 720959024 : (from <= poff || from >= poff + plen) &&
574 694202957 : (to <= poff || to >= poff + plen))
575 642176238 : continue;
576 :
577 78955806 : if (iomap_block_needs_zeroing(iter, block_start)) {
578 71488404 : if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
579 : return -EIO;
580 71488404 : folio_zero_segments(folio, poff, from, to, poff + plen);
581 : } else {
582 7467402 : int status;
583 :
584 7467402 : if (iter->flags & IOMAP_NOWAIT)
585 : return -EAGAIN;
586 :
587 7467402 : status = iomap_read_folio_sync(block_start, folio,
588 : poff, plen, srcmap);
589 7467221 : if (status)
590 1636 : return status;
591 : }
592 79033174 : iomap_set_range_uptodate(folio, iop, poff, plen);
593 721250042 : } while ((block_start += plen) < block_end);
594 :
595 : return 0;
596 : }
597 :
598 1914288094 : static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
599 : size_t len)
600 : {
601 1914288094 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
602 :
603 1914288094 : if (folio_ops && folio_ops->get_folio)
604 0 : return folio_ops->get_folio(iter, pos, len);
605 : else
606 1914288094 : return iomap_get_folio(iter, pos);
607 : }
608 :
609 1916816773 : static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
610 : struct folio *folio)
611 : {
612 1916816773 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
613 :
614 1916816773 : if (folio_ops && folio_ops->put_folio) {
615 0 : folio_ops->put_folio(iter->inode, pos, ret, folio);
616 : } else {
617 1916816773 : folio_unlock(folio);
618 1917433827 : folio_put(folio);
619 : }
620 1917496783 : }
621 :
622 0 : static int iomap_write_begin_inline(const struct iomap_iter *iter,
623 : struct folio *folio)
624 : {
625 : /* needs more work for the tailpacking case; disable for now */
626 0 : if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
627 : return -EIO;
628 0 : return iomap_read_inline_data(iter, folio);
629 : }
630 :
631 1915446445 : static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
632 : size_t len, struct folio **foliop)
633 : {
634 1915446445 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
635 1915446445 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
636 1915446445 : struct folio *folio;
637 1915446445 : int status = 0;
638 :
639 1915446445 : BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
640 1915446445 : if (srcmap != &iter->iomap)
641 98347413 : BUG_ON(pos + len > srcmap->offset + srcmap->length);
642 :
643 1915446445 : if (fatal_signal_pending(current))
644 : return -EINTR;
645 :
646 3828317194 : if (!mapping_large_folio_support(iter->inode->i_mapping))
647 0 : len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
648 :
649 1914158597 : folio = __iomap_get_folio(iter, pos, len);
650 1914410487 : if (IS_ERR(folio))
651 0 : return PTR_ERR(folio);
652 :
653 : /*
654 : * Now we have a locked folio, before we do anything with it we need to
655 : * check that the iomap we have cached is not stale. The inode extent
656 : * mapping can change due to concurrent IO in flight (e.g.
657 : * IOMAP_UNWRITTEN state can change and memory reclaim could have
658 : * reclaimed a previously partially written page at this index after IO
659 : * completion before this write reaches this file offset) and hence we
660 : * could do the wrong thing here (zero a page range incorrectly or fail
661 : * to zero) and corrupt data.
662 : */
663 1914410487 : if (folio_ops && folio_ops->iomap_valid) {
664 1914410487 : bool iomap_valid = folio_ops->iomap_valid(iter->inode,
665 : &iter->iomap);
666 1914488075 : if (!iomap_valid) {
667 117378 : iter->iomap.flags |= IOMAP_F_STALE;
668 117378 : status = 0;
669 117378 : goto out_unlock;
670 : }
671 : }
672 :
673 1914370697 : if (pos + len > folio_pos(folio) + folio_size(folio))
674 6463189 : len = folio_pos(folio) + folio_size(folio) - pos;
675 :
676 1914667698 : if (srcmap->type == IOMAP_INLINE)
677 0 : status = iomap_write_begin_inline(iter, folio);
678 1914667698 : else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
679 0 : status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
680 : else
681 1914667698 : status = __iomap_write_begin(iter, pos, len, folio);
682 :
683 1915916659 : if (unlikely(status))
684 1635 : goto out_unlock;
685 :
686 1915915024 : *foliop = folio;
687 1915915024 : return 0;
688 :
689 119013 : out_unlock:
690 119013 : __iomap_put_folio(iter, pos, 0, folio);
691 119014 : iomap_write_failed(iter->inode, pos, len);
692 :
693 119014 : return status;
694 : }
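/*
 * Conceptual sketch (not kernel code) of what the folio_ops->iomap_valid check
 * above guards against: the mapping is looked up before the folio lock is
 * taken, so a generation number sampled at lookup time is re-checked under the
 * lock and the cached mapping is discarded (IOMAP_F_STALE) if the inode's
 * extent map changed in between. The names below are purely illustrative.
 */
struct cached_mapping {
	unsigned long long gen;		/* generation sampled at lookup time */
};

static unsigned long long extent_map_gen;	/* bumped on every extent change */

static int cached_mapping_still_valid(const struct cached_mapping *map)
{
	/* called with the folio locked, so no further change can slip past */
	return map->gen == extent_map_gen;
}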
695 :
696 1916553457 : static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
697 : size_t copied, struct folio *folio)
698 : {
699 1916553457 : struct iomap_page *iop = to_iomap_page(folio);
700 1916553457 : flush_dcache_folio(folio);
701 :
702 : /*
703 : * The blocks that were entirely written will now be uptodate, so we
704 : * don't have to worry about a read_folio reading them and overwriting a
705 : * partial write. However, if we've encountered a short write and only
706 : * partially written into a block, it will not be marked uptodate, so a
707 : * read_folio might come in and destroy our partial write.
708 : *
709 : * Do the simplest thing and just treat any short write to a
710 : * non-uptodate page as a zero-length write, and force the caller to
711 : * redo the whole thing.
712 : */
713 1916553458 : if (unlikely(copied < len && !folio_test_uptodate(folio)))
714 : return 0;
715 1916553456 : iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
716 1916940755 : filemap_dirty_folio(inode->i_mapping, folio);
717 1916940755 : return copied;
718 : }
719 :
720 0 : static size_t iomap_write_end_inline(const struct iomap_iter *iter,
721 : struct folio *folio, loff_t pos, size_t copied)
722 : {
723 0 : const struct iomap *iomap = &iter->iomap;
724 0 : void *addr;
725 :
726 0 : WARN_ON_ONCE(!folio_test_uptodate(folio));
727 0 : BUG_ON(!iomap_inline_data_valid(iomap));
728 :
729 0 : flush_dcache_folio(folio);
730 0 : addr = kmap_local_folio(folio, pos);
731 0 : memcpy(iomap_inline_data(iomap, pos), addr, copied);
732 0 : kunmap_local(addr);
733 :
734 0 : mark_inode_dirty(iter->inode);
735 0 : return copied;
736 : }
737 :
738 : /* Returns the number of bytes copied. May be 0. Cannot be an errno. */
739 1917129299 : static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
740 : size_t copied, struct folio *folio)
741 : {
742 1917129299 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
743 1917129299 : loff_t old_size = iter->inode->i_size;
744 1917129299 : size_t ret;
745 :
746 1917129299 : if (srcmap->type == IOMAP_INLINE) {
747 0 : ret = iomap_write_end_inline(iter, folio, pos, copied);
748 1917129299 : } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
749 0 : ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
750 : copied, &folio->page, NULL);
751 : } else {
752 1917129299 : ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
753 : }
754 :
755 : /*
756 : * Update the in-memory inode size after copying the data into the page
757 : * cache. It's up to the file system to write the updated size to disk,
758 : * preferably after I/O completion so that no stale data is exposed.
759 : */
760 1916943652 : if (pos + ret > old_size) {
761 621872177 : i_size_write(iter->inode, pos + ret);
762 621872177 : iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
763 : }
764 1916943652 : __iomap_put_folio(iter, pos, ret, folio);
765 :
766 1917353796 : if (old_size < pos)
767 38979662 : pagecache_isize_extended(iter->inode, old_size, pos);
768 1917321969 : if (ret < len)
769 1 : iomap_write_failed(iter->inode, pos + ret, len - ret);
770 1917321969 : return ret;
771 : }
772 :
773 259660899 : static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
774 : {
775 259660899 : loff_t length = iomap_length(iter);
776 259660899 : loff_t pos = iter->pos;
777 259660899 : ssize_t written = 0;
778 259660899 : long status = 0;
779 259660899 : struct address_space *mapping = iter->inode->i_mapping;
780 259660899 : unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
781 :
782 1871357519 : do {
783 1871357519 : struct folio *folio;
784 1871357519 : struct page *page;
785 1871357519 : unsigned long offset; /* Offset into pagecache page */
786 1871357519 : unsigned long bytes; /* Bytes to write to page */
787 1871357519 : size_t copied; /* Bytes copied from user */
788 :
789 1871357519 : offset = offset_in_page(pos);
790 1871357519 : bytes = min_t(unsigned long, PAGE_SIZE - offset,
791 : iov_iter_count(i));
792 1871357520 : again:
793 1871357520 : status = balance_dirty_pages_ratelimited_flags(mapping,
794 : bdp_flags);
795 1870251141 : if (unlikely(status))
796 : break;
797 :
798 1870251141 : if (bytes > length)
799 : bytes = length;
800 :
801 : /*
802 : * Bring in the user page that we'll copy from _first_.
803 : * Otherwise there's a nasty deadlock on copying from the
804 : * same page as we're writing to, without it being marked
805 : * up-to-date.
806 : *
807 : * For async buffered writes the assumption is that the user
808 : * page has already been faulted in. This can be optimized by
809 : * faulting the user page.
810 : */
811 1870251141 : if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
812 : status = -EFAULT;
813 : break;
814 : }
815 :
816 1871848433 : status = iomap_write_begin(iter, pos, bytes, &folio);
817 1871070705 : if (unlikely(status))
818 : break;
819 1871069577 : if (iter->iomap.flags & IOMAP_F_STALE)
820 : break;
821 :
822 1870952881 : page = folio_file_page(folio, pos >> PAGE_SHIFT);
823 1870032500 : if (mapping_writably_mapped(mapping))
824 : flush_dcache_page(page);
825 :
826 1870032500 : copied = copy_page_from_iter_atomic(page, offset, bytes, i);
827 :
828 1872133860 : status = iomap_write_end(iter, pos, bytes, copied, folio);
829 :
830 1871927249 : if (unlikely(copied != status))
831 0 : iov_iter_revert(i, copied - status);
832 :
833 1871927249 : cond_resched();
834 1871802556 : if (unlikely(status == 0)) {
835 : /*
836 : * A short copy made iomap_write_end() reject the
837 : * thing entirely. Might be memory poisoning
838 : * halfway through, might be a race with munmap,
839 : * might be severe memory pressure.
840 : */
841 1 : if (copied)
842 0 : bytes = copied;
843 1 : goto again;
844 : }
845 1871802555 : pos += status;
846 1871802555 : written += status;
847 1871802555 : length -= status;
848 1871802555 : } while (iov_iter_count(i) && length);
849 :
850 260223759 : if (status == -EAGAIN) {
851 0 : iov_iter_revert(i, written);
852 0 : return -EAGAIN;
853 : }
854 260223759 : return written ? written : status;
855 : }
856 :
857 : ssize_t
858 249533008 : iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
859 : const struct iomap_ops *ops)
860 : {
861 249533008 : struct iomap_iter iter = {
862 249533008 : .inode = iocb->ki_filp->f_mapping->host,
863 249533008 : .pos = iocb->ki_pos,
864 : .len = iov_iter_count(i),
865 : .flags = IOMAP_WRITE,
866 : };
867 249533008 : ssize_t ret;
868 :
869 249533008 : if (iocb->ki_flags & IOCB_NOWAIT)
870 0 : iter.flags |= IOMAP_NOWAIT;
871 :
872 509714428 : while ((ret = iomap_iter(&iter, ops)) > 0)
873 259707111 : iter.processed = iomap_write_iter(&iter, i);
874 :
875 249811280 : if (unlikely(iter.pos == iocb->ki_pos))
876 : return ret;
877 247676616 : ret = iter.pos - iocb->ki_pos;
878 247676616 : iocb->ki_pos = iter.pos;
879 247676616 : return ret;
880 : }
881 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
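/*
 * Hedged usage sketch: a filesystem's ->write_iter would normally wrap
 * iomap_file_buffered_write() roughly like this. The myfs_* names and
 * myfs_buffered_iomap_ops are hypothetical; generic_write_checks() and
 * generic_write_sync() are the usual VFS helpers.
 */
extern const struct iomap_ops myfs_buffered_iomap_ops;

static ssize_t myfs_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
				&myfs_buffered_iomap_ops);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}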
882 :
883 : /*
884 : * Scan the data range passed to us for dirty page cache folios. If we find a
885 : * dirty folio, punch out the preceding range and update the offset from which
886 : * the next punch will start.
887 : *
888 : * We can punch out storage reservations under clean pages because they either
889 : * contain data that has been written back - in which case the delalloc punch
890 : * over that range is a no-op - or they were instantiated by read faults, in which
891 : * case they contain zeroes and we can remove the delalloc backing range; any new
892 : * writes to those pages will do the normal hole filling operation...
893 : *
894 : * This makes the logic simple: we only need to keep delalloc extents
895 : * over the dirty ranges of the page cache.
896 : *
897 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
898 : * simplify range iterations.
899 : */
900 1851 : static int iomap_write_delalloc_scan(struct inode *inode,
901 : loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
902 : int (*punch)(struct inode *inode, loff_t offset, loff_t length))
903 : {
904 11445 : while (start_byte < end_byte) {
905 9594 : struct folio *folio;
906 :
907 : /* grab locked page */
908 9594 : folio = filemap_lock_folio(inode->i_mapping,
909 9594 : start_byte >> PAGE_SHIFT);
910 9594 : if (IS_ERR(folio)) {
911 333 : start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
912 : PAGE_SIZE;
913 333 : continue;
914 : }
915 :
916 : /* if dirty, punch up to offset */
917 9261 : if (folio_test_dirty(folio)) {
918 281 : if (start_byte > *punch_start_byte) {
919 0 : int error;
920 :
921 0 : error = punch(inode, *punch_start_byte,
922 : start_byte - *punch_start_byte);
923 0 : if (error) {
924 0 : folio_unlock(folio);
925 0 : folio_put(folio);
926 0 : return error;
927 : }
928 : }
929 :
930 : /*
931 : * Make sure the next punch start is correctly bound to
932 : * the end of this data range, not the end of the folio.
933 : */
934 558 : *punch_start_byte = min_t(loff_t, end_byte,
935 : folio_next_index(folio) << PAGE_SHIFT);
936 : }
937 :
938 : /* move offset to start of next folio in range */
939 9261 : start_byte = folio_next_index(folio) << PAGE_SHIFT;
940 9261 : folio_unlock(folio);
941 9261 : folio_put(folio);
942 : }
943 : return 0;
944 : }
945 :
946 : /*
947 : * Punch out all the delalloc blocks in the range given except for those that
948 : * have dirty data still pending in the page cache - those are going to be
949 : * written and so must still retain the delalloc backing for writeback.
950 : *
951 : * As we are scanning the page cache for data, we don't need to reimplement the
952 : * wheel - mapping_seek_hole_data() does exactly what we need to identify the
953 : * start and end of data ranges correctly even for sub-folio block sizes. This
954 : * byte range based iteration is especially convenient because it means we
955 : * don't have to care about variable size folios, nor where the start or end of
956 : * the data range lies within a folio, whether they lie within the same folio, or
957 : * whether there are multiple discontiguous data ranges within the folio.
958 : *
959 : * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
960 : * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
961 : * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
962 : * date. A write page fault can then mark it dirty. If we then fail a write()
963 : * beyond EOF into that up to date cached range, we allocate a delalloc block
964 : * beyond EOF and then have to punch it out. Because the range is up to date,
965 : * mapping_seek_hole_data() will return it, and we will skip the punch because
966 : * the folio is dirty. This is incorrect - we always need to punch out delalloc
967 : * beyond EOF in this case as writeback will never write back and convert that
968 : * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
969 : * resulting in always punching out the range from the EOF to the end of the
970 : * range the iomap spans.
971 : *
972 : * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
973 : * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
974 : * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
975 : * returns the end of the data range (data_end). Using closed intervals would
976 : * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
977 : * the code to subtle off-by-one bugs....
978 : */
979 37931 : static int iomap_write_delalloc_release(struct inode *inode,
980 : loff_t start_byte, loff_t end_byte,
981 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
982 : {
983 37931 : loff_t punch_start_byte = start_byte;
984 37931 : loff_t scan_end_byte = min(i_size_read(inode), end_byte);
985 37931 : int error = 0;
986 :
987 : /*
988 : * Lock the mapping to avoid races with page faults re-instantiating
989 : * folios and dirtying them via ->page_mkwrite whilst we walk the
990 : * cache and perform delalloc extent removal. Failing to do this can
991 : * leave dirty pages with no space reservation in the cache.
992 : */
993 37931 : filemap_invalidate_lock(inode->i_mapping);
994 39782 : while (start_byte < scan_end_byte) {
995 28957 : loff_t data_end;
996 :
997 28957 : start_byte = mapping_seek_hole_data(inode->i_mapping,
998 : start_byte, scan_end_byte, SEEK_DATA);
999 : /*
1000 : * If there is no more data to scan, all that is left is to
1001 : * punch out the remaining range.
1002 : */
1003 28957 : if (start_byte == -ENXIO || start_byte == scan_end_byte)
1004 : break;
1005 1851 : if (start_byte < 0) {
1006 0 : error = start_byte;
1007 0 : goto out_unlock;
1008 : }
1009 1851 : WARN_ON_ONCE(start_byte < punch_start_byte);
1010 1851 : WARN_ON_ONCE(start_byte > scan_end_byte);
1011 :
1012 : /*
1013 : * We find the end of this contiguous cached data range by
1014 : * seeking from start_byte to the beginning of the next hole.
1015 : */
1016 1851 : data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
1017 : scan_end_byte, SEEK_HOLE);
1018 1851 : if (data_end < 0) {
1019 0 : error = data_end;
1020 0 : goto out_unlock;
1021 : }
1022 1851 : WARN_ON_ONCE(data_end <= start_byte);
1023 1851 : WARN_ON_ONCE(data_end > scan_end_byte);
1024 :
1025 1851 : error = iomap_write_delalloc_scan(inode, &punch_start_byte,
1026 : start_byte, data_end, punch);
1027 1851 : if (error)
1028 0 : goto out_unlock;
1029 :
1030 : /* The next data search starts at the end of this one. */
1031 : start_byte = data_end;
1032 : }
1033 :
1034 37931 : if (punch_start_byte < end_byte)
1035 37789 : error = punch(inode, punch_start_byte,
1036 : end_byte - punch_start_byte);
1037 142 : out_unlock:
1038 37929 : filemap_invalidate_unlock(inode->i_mapping);
1039 37929 : return error;
1040 : }
1041 :
1042 : /*
1043 : * When a short write occurs, the filesystem may need to remove reserved space
1044 : * that was allocated in ->iomap_begin from its ->iomap_end method. For
1045 : * filesystems that use delayed allocation, we need to punch out delalloc
1046 : * extents from the range that are not dirty in the page cache. As the write can
1047 : * race with page faults, there can be dirty pages over the delalloc extent
1048 : * outside the range of a short write but still within the delalloc extent
1049 : * allocated for this iomap.
1050 : *
1051 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
1052 : * simplify range iterations.
1053 : *
1054 : * The punch() callback *must* only punch delalloc extents in the range passed
1055 : * to it. It must skip over all other types of extents in the range and leave
1056 : * them completely unchanged. It must do this punch atomically with respect to
1057 : * other extent modifications.
1058 : *
1059 : * The punch() callback may be called with a folio locked to prevent writeback
1060 : * extent allocation racing at the edge of the range we are currently punching.
1061 : * The locked folio may or may not cover the range being punched, so it is not
1062 : * safe for the punch() callback to lock folios itself.
1063 : *
1064 : * Lock order is:
1065 : *
1066 : * inode->i_rwsem (shared or exclusive)
1067 : * inode->i_mapping->invalidate_lock (exclusive)
1068 : * folio_lock()
1069 : * ->punch
1070 : * internal filesystem allocation lock
1071 : */
1072 405058885 : int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
1073 : struct iomap *iomap, loff_t pos, loff_t length,
1074 : ssize_t written,
1075 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
1076 : {
1077 405058885 : loff_t start_byte;
1078 405058885 : loff_t end_byte;
1079 405058885 : unsigned int blocksize = i_blocksize(inode);
1080 :
1081 404695795 : if (iomap->type != IOMAP_DELALLOC)
1082 : return 0;
1083 :
1084 : /* If we didn't reserve the blocks, we're not allowed to punch them. */
1085 169463742 : if (!(iomap->flags & IOMAP_F_NEW))
1086 : return 0;
1087 :
1088 : /*
1089 : * start_byte refers to the first unused block after a short write. If
1090 : * nothing was written, round offset down to point at the first block in
1091 : * the range.
1092 : */
1093 31501236 : if (unlikely(!written))
1094 19096 : start_byte = round_down(pos, blocksize);
1095 : else
1096 31482140 : start_byte = round_up(pos + written, blocksize);
1097 31501236 : end_byte = round_up(pos + length, blocksize);
1098 :
1099 : /* Nothing to do if we've written the entire delalloc extent */
1100 31501236 : if (start_byte >= end_byte)
1101 : return 0;
1102 :
1103 37931 : return iomap_write_delalloc_release(inode, start_byte, end_byte,
1104 : punch);
1105 : }
1106 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
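/*
 * Worked example (standalone sketch) of the rounding above. With 4kB blocks,
 * a write covered pos = 5000, length = 20000 but only 3000 bytes were copied:
 * start_byte = round_up(8000, 4096) = 8192 and
 * end_byte = round_up(25000, 4096) = 28672, so the delalloc blocks backing
 * [8192, 28672) are punch candidates, skipping any that are still dirty in
 * the page cache.
 */
#include <stdio.h>

static long long round_up_ll(long long x, long long align)
{
	return ((x + align - 1) / align) * align;
}

int main(void)
{
	long long blocksize = 4096, pos = 5000, length = 20000, written = 3000;
	long long start_byte = round_up_ll(pos + written, blocksize);
	long long end_byte = round_up_ll(pos + length, blocksize);

	printf("punch candidates: [%lld, %lld)\n", start_byte, end_byte);
	return 0;
}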
1107 :
1108 51272 : static loff_t iomap_unshare_iter(struct iomap_iter *iter)
1109 : {
1110 51272 : struct iomap *iomap = &iter->iomap;
1111 51272 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1112 51272 : loff_t pos = iter->pos;
1113 51272 : loff_t length = iomap_length(iter);
1114 51272 : long status = 0;
1115 51272 : loff_t written = 0;
1116 :
1117 : /* don't bother with blocks that are not shared to start with */
1118 51272 : if (!(iomap->flags & IOMAP_F_SHARED))
1119 : return length;
1120 : /* don't bother with holes or unwritten extents */
1121 23289 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1122 : return length;
1123 :
1124 71021 : do {
1125 71021 : unsigned long offset = offset_in_page(pos);
1126 71021 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1127 71021 : struct folio *folio;
1128 :
1129 71021 : status = iomap_write_begin(iter, pos, bytes, &folio);
1130 71021 : if (unlikely(status))
1131 0 : return status;
1132 71021 : if (iter->iomap.flags & IOMAP_F_STALE)
1133 : break;
1134 :
1135 71021 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1136 71021 : if (WARN_ON_ONCE(status == 0))
1137 : return -EIO;
1138 :
1139 71021 : cond_resched();
1140 :
1141 71021 : pos += status;
1142 71021 : written += status;
1143 71021 : length -= status;
1144 :
1145 71021 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1146 71021 : } while (length);
1147 :
1148 : return written;
1149 : }
1150 :
1151 : int
1152 116 : iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1153 : const struct iomap_ops *ops)
1154 : {
1155 116 : struct iomap_iter iter = {
1156 : .inode = inode,
1157 : .pos = pos,
1158 : .len = len,
1159 : .flags = IOMAP_WRITE | IOMAP_UNSHARE,
1160 : };
1161 116 : int ret;
1162 :
1163 51388 : while ((ret = iomap_iter(&iter, ops)) > 0)
1164 51272 : iter.processed = iomap_unshare_iter(&iter);
1165 116 : return ret;
1166 : }
1167 : EXPORT_SYMBOL_GPL(iomap_file_unshare);
1168 :
1169 145303566 : static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
1170 : {
1171 145303566 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1172 145303566 : loff_t pos = iter->pos;
1173 145303566 : loff_t length = iomap_length(iter);
1174 145303566 : loff_t written = 0;
1175 :
1176 : /* already zeroed? we're done. */
1177 145303566 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1178 : return length;
1179 :
1180 45255632 : do {
1181 45255632 : struct folio *folio;
1182 45255632 : int status;
1183 45255632 : size_t offset;
1184 45255632 : size_t bytes = min_t(u64, SIZE_MAX, length);
1185 :
1186 45255632 : status = iomap_write_begin(iter, pos, bytes, &folio);
1187 45298863 : if (status)
1188 1303 : return status;
1189 45297560 : if (iter->iomap.flags & IOMAP_F_STALE)
1190 : break;
1191 :
1192 45296878 : offset = offset_in_folio(folio, pos);
1193 45295788 : if (bytes > folio_size(folio) - offset)
1194 6463171 : bytes = folio_size(folio) - offset;
1195 :
1196 45292352 : folio_zero_range(folio, offset, bytes);
1197 45244727 : folio_mark_accessed(folio);
1198 :
1199 45456640 : bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
1200 45371300 : if (WARN_ON_ONCE(bytes == 0))
1201 : return -EIO;
1202 :
1203 45371300 : pos += bytes;
1204 45371300 : length -= bytes;
1205 45371300 : written += bytes;
1206 45371300 : } while (length > 0);
1207 :
1208 38893112 : if (did_zero)
1209 5843768 : *did_zero = true;
1210 : return written;
1211 : }
1212 :
1213 : int
1214 124559642 : iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1215 : const struct iomap_ops *ops)
1216 : {
1217 124559642 : struct iomap_iter iter = {
1218 : .inode = inode,
1219 : .pos = pos,
1220 : .len = len,
1221 : .flags = IOMAP_ZERO,
1222 : };
1223 124559642 : int ret;
1224 :
1225 269976855 : while ((ret = iomap_iter(&iter, ops)) > 0)
1226 145316152 : iter.processed = iomap_zero_iter(&iter, did_zero);
1227 124411848 : return ret;
1228 : }
1229 : EXPORT_SYMBOL_GPL(iomap_zero_range);
1230 :
1231 : int
1232 6771065 : iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1233 : const struct iomap_ops *ops)
1234 : {
1235 6771065 : unsigned int blocksize = i_blocksize(inode);
1236 6771042 : unsigned int off = pos & (blocksize - 1);
1237 :
1238 : /* Block boundary? Nothing to do */
1239 6771042 : if (!off)
1240 : return 0;
1241 4618613 : return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
1242 : }
1243 : EXPORT_SYMBOL_GPL(iomap_truncate_page);
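/*
 * Worked example (standalone sketch) of the partial-block test above: with
 * 4kB blocks, truncating to pos = 10000 leaves off = 10000 & 4095 = 1808, so
 * iomap_zero_range() is asked to zero the remaining 4096 - 1808 = 2288 bytes
 * of that block; a pos on a block boundary needs no zeroing at all.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;
	long long pos = 10000;
	unsigned int off = pos & (blocksize - 1);	/* 1808 */

	if (off)
		printf("zero %u bytes starting at %lld\n", blocksize - off, pos);
	return 0;
}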
1244 :
1245 86217666 : static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
1246 : struct folio *folio)
1247 : {
1248 86217666 : loff_t length = iomap_length(iter);
1249 86217666 : int ret;
1250 :
1251 86217666 : if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
1252 0 : ret = __block_write_begin_int(folio, iter->pos, length, NULL,
1253 0 : &iter->iomap);
1254 0 : if (ret)
1255 0 : return ret;
1256 0 : block_commit_write(&folio->page, 0, length);
1257 : } else {
1258 172424848 : WARN_ON_ONCE(!folio_test_uptodate(folio));
1259 86217123 : folio_mark_dirty(folio);
1260 : }
1261 :
1262 : return length;
1263 : }
1264 :
1265 83770901 : vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1266 : {
1267 83770901 : struct iomap_iter iter = {
1268 83770901 : .inode = file_inode(vmf->vma->vm_file),
1269 : .flags = IOMAP_WRITE | IOMAP_FAULT,
1270 : };
1271 83770901 : struct folio *folio = page_folio(vmf->page);
1272 83389808 : ssize_t ret;
1273 :
1274 83389808 : folio_lock(folio);
1275 83725418 : ret = folio_mkwrite_check_truncate(folio, iter.inode);
1276 83753965 : if (ret < 0)
1277 13845 : goto out_unlock;
1278 83740120 : iter.pos = folio_pos(folio);
1279 83740120 : iter.len = ret;
1280 170277878 : while ((ret = iomap_iter(&iter, ops)) > 0)
1281 86191516 : iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
1282 :
1283 83688955 : if (ret < 0)
1284 42080 : goto out_unlock;
1285 83646875 : folio_wait_stable(folio);
1286 83646875 : return VM_FAULT_LOCKED;
1287 55925 : out_unlock:
1288 55925 : folio_unlock(folio);
1289 55926 : return block_page_mkwrite_return(ret);
1290 : }
1291 : EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
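/*
 * Hedged usage sketch: a filesystem exposes the fault handler above through
 * its vm_operations_struct, usually taking any locks its ->iomap_begin needs
 * first. The myfs_* names and myfs_buffered_iomap_ops are hypothetical.
 */
extern const struct iomap_ops myfs_buffered_iomap_ops;

static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	return iomap_page_mkwrite(vmf, &myfs_buffered_iomap_ops);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= myfs_page_mkwrite,
};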
1292 :
1293 679292003 : static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
1294 : size_t len, int error)
1295 : {
1296 679292003 : struct iomap_page *iop = to_iomap_page(folio);
1297 :
1298 679292003 : if (error) {
1299 3463861 : folio_set_error(folio);
1300 3463872 : mapping_set_error(inode->i_mapping, error);
1301 : }
1302 :
1303 1358584018 : WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
1304 692160482 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1305 :
1306 679292005 : if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1307 678514298 : folio_end_writeback(folio);
1308 679292012 : }
1309 :
1310 : /*
1311 : * We're now finished for good with this ioend structure. Update the page
1312 : * state, release holds on bios, and finally free up memory. Do not use the
1313 : * ioend after this.
1314 : */
1315 : static u32
1316 70788072 : iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1317 : {
1318 70788072 : struct inode *inode = ioend->io_inode;
1319 70788072 : struct bio *bio = &ioend->io_inline_bio;
1320 70788072 : struct bio *last = ioend->io_bio, *next;
1321 70788072 : u64 start = bio->bi_iter.bi_sector;
1322 70788072 : loff_t offset = ioend->io_offset;
1323 70788072 : bool quiet = bio_flagged(bio, BIO_QUIET);
1324 70788072 : u32 folio_count = 0;
1325 :
1326 141686615 : for (bio = &ioend->io_inline_bio; bio; bio = next) {
1327 70898543 : struct folio_iter fi;
1328 :
1329 : /*
1330 : * For the last bio, bi_private points to the ioend, so we
1331 : * need to explicitly end the iteration here.
1332 : */
1333 70898543 : if (bio == last)
1334 : next = NULL;
1335 : else
1336 110471 : next = bio->bi_private;
1337 :
1338 : /* walk all folios in bio, ending page IO on them */
1339 750190545 : bio_for_each_folio_all(fi, bio) {
1340 679292003 : iomap_finish_folio_write(inode, fi.folio, fi.length,
1341 : error);
1342 679292012 : folio_count++;
1343 : }
1344 70898542 : bio_put(bio);
1345 : }
1346 : /* The ioend has been freed by bio_put() */
1347 :
1348 70788072 : if (unlikely(error && !quiet)) {
1349 57142 : printk_ratelimited(KERN_ERR
1350 : "%s: writeback error on inode %lu, offset %lld, sector %llu",
1351 : inode->i_sb->s_id, inode->i_ino, offset, start);
1352 : }
1353 70788072 : return folio_count;
1354 : }
1355 :
1356 : /*
1357 : * Ioend completion routine for merged bios. This can only be called from task
1358 : * contexts as merged ioends can be of unbounded length. Hence we have to break up
1359 : * the writeback completions into manageable chunks to avoid long scheduler
1360 : * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
1361 : * good batch processing throughput without creating adverse scheduler latency
1362 : * conditions.
1363 : */
1364 : void
1365 61686736 : iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1366 : {
1367 61686736 : struct list_head tmp;
1368 61686736 : u32 completions;
1369 :
1370 61686736 : might_sleep();
1371 :
1372 61686736 : list_replace_init(&ioend->io_list, &tmp);
1373 61686736 : completions = iomap_finish_ioend(ioend, error);
1374 :
1375 61818264 : while (!list_empty(&tmp)) {
1376 131528 : if (completions > IOEND_BATCH_SIZE * 8) {
1377 0 : cond_resched();
1378 0 : completions = 0;
1379 : }
1380 131528 : ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1381 131528 : list_del_init(&ioend->io_list);
1382 131528 : completions += iomap_finish_ioend(ioend, error);
1383 : }
1384 61686736 : }
1385 : EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1386 :
1387 : /*
1388 : * We can merge two adjacent ioends if they have the same set of work to do.
1389 : */
1390 : static bool
1391 11388464 : iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1392 : {
1393 11388464 : if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1394 : return false;
1395 11388450 : if ((ioend->io_flags & IOMAP_F_SHARED) ^
1396 11388450 : (next->io_flags & IOMAP_F_SHARED))
1397 : return false;
1398 11288895 : if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1399 11288895 : (next->io_type == IOMAP_UNWRITTEN))
1400 : return false;
1401 9942453 : if (ioend->io_offset + ioend->io_size != next->io_offset)
1402 : return false;
1403 : /*
1404 : * Do not merge physically discontiguous ioends. The filesystem
1405 : * completion functions will have to iterate the physical
1406 : * discontiguities even if we merge the ioends at a logical level, so
1407 : * we don't gain anything by merging physical discontiguities here.
1408 : *
1409 : * We cannot use bio->bi_iter.bi_sector here as it is modified during
1410 : * submission so does not point to the start sector of the bio at
1411 : * completion.
1412 : */
1413 3328730 : if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
1414 3197202 : return false;
1415 : return true;
1416 : }
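/*
 * Worked example (standalone sketch) of the physical contiguity test above:
 * an ioend starting at sector 1000 with io_size = 64kB covers 64k >> 9 = 128
 * sectors, so only an ioend whose io_sector is exactly 1128 can be merged
 * with it; anything else is physically discontiguous and is left unmerged.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long io_sector = 1000;
	unsigned long long io_size = 64 * 1024;	/* bytes */
	unsigned long long next_ok = io_sector + (io_size >> 9);

	printf("mergeable only if the next ioend starts at sector %llu\n",
	       next_ok);	/* 1128 */
	return 0;
}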
1417 :
1418 : void
1419 61686736 : iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1420 : {
1421 61686736 : struct iomap_ioend *next;
1422 :
1423 61686736 : INIT_LIST_HEAD(&ioend->io_list);
1424 :
1425 61818264 : while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1426 : io_list))) {
1427 11388464 : if (!iomap_ioend_can_merge(ioend, next))
1428 : break;
1429 131528 : list_move_tail(&next->io_list, &ioend->io_list);
1430 131528 : ioend->io_size += next->io_size;
1431 : }
1432 61686736 : }
1433 : EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
1434 :
1435 : static int
1436 29440593 : iomap_ioend_compare(void *priv, const struct list_head *a,
1437 : const struct list_head *b)
1438 : {
1439 29440593 : struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1440 29440593 : struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1441 :
1442 29440593 : if (ia->io_offset < ib->io_offset)
1443 : return -1;
1444 12842741 : if (ia->io_offset > ib->io_offset)
1445 12841295 : return 1;
1446 : return 0;
1447 : }
1448 :
1449 : void
1450 50429800 : iomap_sort_ioends(struct list_head *ioend_list)
1451 : {
1452 50429800 : list_sort(NULL, ioend_list, iomap_ioend_compare);
1453 50429800 : }
1454 : EXPORT_SYMBOL_GPL(iomap_sort_ioends);
1455 :
1456 8969808 : static void iomap_writepage_end_bio(struct bio *bio)
1457 : {
1458 8969808 : struct iomap_ioend *ioend = bio->bi_private;
1459 :
1460 8969808 : iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1461 8969808 : }
1462 :
1463 : /*
1464 : * Submit the final bio for an ioend.
1465 : *
1466 : * If @error is non-zero, it means that we have a situation where some part of
1467 : * the submission process has failed after we've marked pages for writeback
1468 : * and unlocked them. In this situation, we need to fail the bio instead of
1469 : * submitting it. This typically only happens on a filesystem shutdown.
1470 : */
1471 : static int
1472 70787437 : iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1473 : int error)
1474 : {
1475 70787437 : ioend->io_bio->bi_private = ioend;
1476 70787437 : ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1477 :
1478 70787437 : if (wpc->ops->prepare_ioend)
1479 70787437 : error = wpc->ops->prepare_ioend(ioend, error);
1480 70787402 : if (error) {
1481 : /*
1482 : * If we're failing the IO now, just mark the ioend with an
1483 : * error and finish it. This will run IO completion immediately
1484 : * as there is only one reference to the ioend at this point in
1485 : * time.
1486 : */
1487 374 : ioend->io_bio->bi_status = errno_to_blk_status(error);
1488 374 : bio_endio(ioend->io_bio);
1489 374 : return error;
1490 : }
1491 :
1492 70787028 : submit_bio(ioend->io_bio);
1493 70787028 : return 0;
1494 : }
1495 :
1496 : static struct iomap_ioend *
1497 70787581 : iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1498 : loff_t offset, sector_t sector, struct writeback_control *wbc)
1499 : {
1500 70787581 : struct iomap_ioend *ioend;
1501 70787581 : struct bio *bio;
1502 :
1503 111512480 : bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
1504 : REQ_OP_WRITE | wbc_to_write_flags(wbc),
1505 : GFP_NOFS, &iomap_ioend_bioset);
1506 70787805 : bio->bi_iter.bi_sector = sector;
1507 70787805 : wbc_init_bio(wbc, bio);
1508 :
1509 70787694 : ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1510 70787694 : INIT_LIST_HEAD(&ioend->io_list);
1511 70787694 : ioend->io_type = wpc->iomap.type;
1512 70787694 : ioend->io_flags = wpc->iomap.flags;
1513 70787694 : ioend->io_inode = inode;
1514 70787694 : ioend->io_size = 0;
1515 70787694 : ioend->io_folios = 0;
1516 70787694 : ioend->io_offset = offset;
1517 70787694 : ioend->io_bio = bio;
1518 70787694 : ioend->io_sector = sector;
1519 70787694 : return ioend;
1520 : }
1521 :
1522 : /*
1523 : * Allocate a new bio, and chain the old bio to the new one.
1524 : *
1525 : * Note that we have to perform the chaining in this unintuitive order
1526 : * so that the bi_private linkage is set up in the right direction for the
1527 : * traversal in iomap_finish_ioend().
1528 : */
1529 : static struct bio *
1530 110471 : iomap_chain_bio(struct bio *prev)
1531 : {
1532 110471 : struct bio *new;
1533 :
1534 110471 : new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
1535 110471 : bio_clone_blkg_association(new, prev);
1536 110471 : new->bi_iter.bi_sector = bio_end_sector(prev);
1537 :
1538 110471 : bio_chain(prev, new);
1539 110471 : bio_get(prev); /* for iomap_finish_ioend */
1540 110471 : submit_bio(prev);
1541 110471 : return new;
1542 : }
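 :
 : /*
 : * After chaining, each earlier bio's ->bi_private points at the bio
 : * allocated after it, which is the order iomap_finish_ioend() walks from
 : * io_inline_bio when ending folio writeback.
 : */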
1543 :
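 : /*
 : * Decide whether the block at @offset/@sector can be appended to the
 : * cached ioend. Returning false does not fail writeback; the caller
 : * simply queues the current ioend for submission and starts a new one.
 : */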
1544 : static bool
1545 681714556 : iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1546 : sector_t sector)
1547 : {
1548 681714556 : if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1549 681714556 : (wpc->ioend->io_flags & IOMAP_F_SHARED))
1550 : return false;
1551 681247695 : if (wpc->iomap.type != wpc->ioend->io_type)
1552 : return false;
1553 672008480 : if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1554 : return false;
1555 661643124 : if (sector != bio_end_sector(wpc->ioend->io_bio))
1556 : return false;
1557 : /*
1558 : * Limit ioend bio chain lengths to minimise IO completion latency. This
1559 : * also prevents long tight loops ending page writeback on all the
1560 : * folios in the ioend.
1561 : */
1562 653364259 : if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
1563 23289 : return false;
1564 : return true;
1565 : }
1566 :
1567 : /*
1568 : * Test to see if we have an existing ioend structure that we could append to
1569 : * first; otherwise finish off the current ioend and start another.
1570 : */
1571 : static void
1572 724128509 : iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
1573 : struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1574 : struct writeback_control *wbc, struct list_head *iolist)
1575 : {
1576 724128509 : sector_t sector = iomap_sector(&wpc->iomap, pos);
1577 724128509 : unsigned len = i_blocksize(inode);
1578 724127547 : size_t poff = offset_in_folio(folio, pos);
1579 :
1580 724126993 : if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
1581 70787477 : if (wpc->ioend)
1582 28374382 : list_add(&wpc->ioend->io_list, iolist);
1583 70787467 : wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
1584 : }
1585 :
1586 724127110 : if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
1587 110471 : wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
1588 110471 : bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
1589 : }
1590 :
1591 724145131 : if (iop)
1592 57737476 : atomic_add(len, &iop->write_bytes_pending);
1593 724145646 : wpc->ioend->io_size += len;
1594 724145646 : wbc_account_cgroup_owner(wbc, &folio->page, len);
1595 724134280 : }
1596 :
1597 : /*
1598 : * We implement an immediate ioend submission policy here to avoid needing to
1599 : * chain multiple ioends and hence nest mempool allocations which can violate
1600 : * the forward progress guarantees we need to provide. The current ioend we're
1601 : * adding blocks to is cached in the writepage context, and if the new block
1602 : * doesn't append to the cached ioend, it will create a new ioend and cache that
1603 : * instead.
1604 : *
1605 : * If a new ioend is created and cached, the old ioend is returned and queued
1606 : * locally for submission once the entire page is processed or an error has been
1607 : * detected. While ioends are submitted immediately after they are completed,
1608 : * batching optimisations are provided by higher level block plugging.
1609 : *
1610 : * At the end of a writeback pass, there will be a cached ioend remaining on the
1611 : * writepage context that the caller will need to submit.
1612 : */
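 : /*
 : * ->map_blocks() is expected to leave wpc->iomap holding a mapping that
 : * covers at least the block at the passed-in position (reusing a still
 : * valid cached mapping where possible); IOMAP_HOLE blocks are skipped
 : * below and IOMAP_INLINE mappings are never expected here.
 : */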
1613 : static int
1614 679963423 : iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1615 : struct writeback_control *wbc, struct inode *inode,
1616 : struct folio *folio, u64 end_pos)
1617 : {
1618 679963423 : struct iomap_page *iop = iomap_page_create(inode, folio, 0);
1619 679961771 : struct iomap_ioend *ioend, *next;
1620 679961771 : unsigned len = i_blocksize(inode);
1621 679954751 : unsigned nblocks = i_blocks_per_folio(inode, folio);
1622 679951708 : u64 pos = folio_pos(folio);
1623 679951708 : int error = 0, count = 0, i;
1624 679951708 : LIST_HEAD(submit_list);
1625 :
1626 692060283 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1627 :
1628 : /*
1629 : * Walk through the folio to find areas to write back. If we
1630 : * run off the end of the current map or find the current map
1631 : * invalid, grab a new one.
1632 : */
1633 1405956654 : for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1634 787080487 : if (iop && !test_bit(i, iop->uptodate))
1635 1429 : continue;
1636 :
1637 727459773 : error = wpc->ops->map_blocks(wpc, inode, pos);
1638 727468935 : if (error)
1639 : break;
1640 725996394 : trace_iomap_writepage_map(inode, &wpc->iomap);
1641 725997747 : if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1642 0 : continue;
1643 725997747 : if (wpc->iomap.type == IOMAP_HOLE)
1644 1869308 : continue;
1645 724128439 : iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
1646 : &submit_list);
1647 724134209 : count++;
1648 : }
1649 679968057 : if (count)
1650 678489497 : wpc->ioend->io_folios++;
1651 :
1652 681441329 : WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1653 679968057 : WARN_ON_ONCE(!folio_test_locked(folio));
1654 679968057 : WARN_ON_ONCE(folio_test_writeback(folio));
1655 679968057 : WARN_ON_ONCE(folio_test_dirty(folio));
1656 :
1657 : /*
1658 : * We cannot cancel the ioend directly here on error. We may have
1659 : * already set other pages under writeback and hence we have to run I/O
1660 : * completion to mark the error state of the pages under writeback
1661 : * appropriately.
1662 : */
1663 679968057 : if (unlikely(error)) {
1664 : /*
1665 : * Let the filesystem know what portion of the current folio
1666 : * failed to map. If the folio hasn't been added to an ioend,
1667 : * it won't be affected by I/O completion and we must unlock
1668 : * it now.
1669 : */
1670 1472540 : if (wpc->ops->discard_folio)
1671 1472540 : wpc->ops->discard_folio(folio, pos);
1672 1472541 : if (!count) {
1673 1472540 : folio_unlock(folio);
1674 1472540 : goto done;
1675 : }
1676 : }
1677 :
1678 678495518 : folio_start_writeback(folio);
1679 678507829 : folio_unlock(folio);
1680 :
1681 : /*
1682 : * Preserve the original error if there was one; catch
1683 : * submission errors here and propagate into subsequent ioend
1684 : * submissions.
1685 : */
1686 706888210 : list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1687 28375132 : int error2;
1688 :
1689 28375132 : list_del_init(&ioend->io_list);
1690 28374298 : error2 = iomap_submit_ioend(wpc, ioend, error);
1691 28374514 : if (error2 && !error)
1692 0 : error = error2;
1693 : }
1694 :
1695 : /*
1696 : * We can end up here with no error and nothing to write only if we race
1697 : * with a partial page truncate on a sub-page block sized filesystem.
1698 : */
1699 678513078 : if (!count)
1700 6020 : folio_end_writeback(folio);
1701 678507058 : done:
1702 679985618 : mapping_set_error(inode->i_mapping, error);
1703 679985361 : return error;
1704 : }
1705 :
1706 : /*
1707 : * Write out a dirty page.
1708 : *
1709 : * For delalloc space on the page, we need to allocate space and flush it.
1710 : * For unwritten space on the page, we need to start the conversion to
1711 : * regular allocated space.
1712 : */
1713 679998433 : static int iomap_do_writepage(struct folio *folio,
1714 : struct writeback_control *wbc, void *data)
1715 : {
1716 679998433 : struct iomap_writepage_ctx *wpc = data;
1717 679998433 : struct inode *inode = folio->mapping->host;
1718 679998433 : u64 end_pos, isize;
1719 :
1720 679998433 : trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
1721 :
1722 : /*
1723 : * Refuse to write the folio out if we're called from reclaim context.
1724 : *
1725 : * This avoids stack overflows when called from already deep stacks in
1726 : * arbitrary callers doing direct reclaim or memcg reclaim. We explicitly
1727 : * allow reclaim from kswapd as the stack usage there is relatively low.
1728 : *
1729 : * This should never happen except in the case of a VM regression so
1730 : * warn about it.
1731 : */
1732 679991057 : if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1733 : PF_MEMALLOC))
1734 0 : goto redirty;
1735 :
1736 : /*
1737 : * Is this folio beyond the end of the file?
1738 : *
1739 : * If the folio index is less than end_index, adjust end_pos to the
1740 : * highest offset that this folio should represent.
1741 : * -----------------------------------------------------
1742 : * | file mapping | <EOF> |
1743 : * -----------------------------------------------------
1744 : * | Page ... | Page N-2 | Page N-1 | Page N | |
1745 : * ^--------------------------------^----------|--------
1746 : * | desired writeback range | see else |
1747 : * ---------------------------------^------------------|
1748 : */
1749 679991057 : isize = i_size_read(inode);
1750 679991057 : end_pos = folio_pos(folio) + folio_size(folio);
1751 679991446 : if (end_pos > isize) {
1752 : /*
1753 : * Check whether the folio to write out is beyond or straddles
1754 : * i_size.
1755 : * -------------------------------------------------------
1756 : * | file mapping | <EOF> |
1757 : * -------------------------------------------------------
1758 : * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
1759 : * ^--------------------------------^-----------|---------
1760 : * | | Straddles |
1761 : * ---------------------------------^-----------|--------|
1762 : */
1763 16769140 : size_t poff = offset_in_folio(folio, isize);
1764 16769139 : pgoff_t end_index = isize >> PAGE_SHIFT;
1765 :
1766 : /*
1767 : * Skip the folio if it's fully outside i_size, e.g.
1768 : * due to a truncate operation that's in progress. We've
1769 : * cleaned this folio and truncate will finish things off
1770 : * for us.
1771 : *
1772 : * Note that end_index is an unsigned long. If the given
1773 : * offset is greater than 16TB on a 32-bit system, then a
1774 : * check for a folio fully outside i_size such as
1775 : * "if (folio->index >= end_index + 1)" would overflow:
1776 : * "end_index + 1" evaluates to 0. The folio would then be
1777 : * redirtied and written out repeatedly, resulting in an
1778 : * infinite loop; the user program performing this operation
1779 : * would hang. Instead, we detect this situation by checking
1780 : * whether the folio is entirely beyond i_size or whether its
1781 : * offset is exactly equal to the EOF.
1782 : */
1783 16769139 : if (folio->index > end_index ||
1784 16726142 : (folio->index == end_index && poff == 0))
1785 30494 : goto unlock;
1786 :
1787 : /*
1788 : * The page straddles i_size. It must be zeroed out on each
1789 : * and every writepage invocation because it may be mmapped.
1790 : * "A file is mapped in multiples of the page size. For a file
1791 : * that is not a multiple of the page size, the remaining
1792 : * memory is zeroed when mapped, and writes to that region are
1793 : * not written out to the file."
1794 : */
1795 16738645 : folio_zero_segment(folio, poff, folio_size(folio));
1796 16738643 : end_pos = isize;
1797 : }
1798 :
1799 679960978 : return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
1800 :
1801 : redirty:
1802 0 : folio_redirty_for_writepage(wbc, folio);
1803 30494 : unlock:
1804 30494 : folio_unlock(folio);
1805 30494 : return 0;
1806 : }
1807 :
1808 : int
1809 77129362 : iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1810 : struct iomap_writepage_ctx *wpc,
1811 : const struct iomap_writeback_ops *ops)
1812 : {
1813 77129362 : int ret;
1814 :
1815 77129362 : wpc->ops = ops;
1816 77129362 : ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1817 77128167 : if (!wpc->ioend)
1818 : return ret;
1819 42413333 : return iomap_submit_ioend(wpc, wpc->ioend, ret);
1820 : }
1821 : EXPORT_SYMBOL_GPL(iomap_writepages);
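 :
 : /*
 : * Illustrative sketch of caller-side wiring (the my_* names are
 : * hypothetical; the ops mirror the callbacks used in this file):
 : *
 : *	static const struct iomap_writeback_ops my_writeback_ops = {
 : *		.map_blocks	= my_map_blocks,
 : *		.prepare_ioend	= my_prepare_ioend,
 : *		.discard_folio	= my_discard_folio,
 : *	};
 : *
 : *	static int my_writepages(struct address_space *mapping,
 : *			struct writeback_control *wbc)
 : *	{
 : *		struct iomap_writepage_ctx wpc = { };
 : *
 : *		return iomap_writepages(mapping, wbc, &wpc, &my_writeback_ops);
 : *	}
 : */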
1822 :
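 : /*
 : * The front padding of offsetof(struct iomap_ioend, io_inline_bio) means
 : * every bio allocated from iomap_ioend_bioset has its ioend placed
 : * directly in front of the embedded bio, which iomap_alloc_ioend()
 : * recovers via container_of().
 : */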
1823 0 : static int __init iomap_init(void)
1824 : {
1825 0 : return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1826 : offsetof(struct iomap_ioend, io_inline_bio),
1827 : BIOSET_NEED_BVECS);
1828 : }
1829 : fs_initcall(iomap_init);
|