Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2010 Red Hat, Inc.
4 : * Copyright (C) 2016-2019 Christoph Hellwig.
5 : */
6 : #include <linux/module.h>
7 : #include <linux/compiler.h>
8 : #include <linux/fs.h>
9 : #include <linux/iomap.h>
10 : #include <linux/pagemap.h>
11 : #include <linux/uio.h>
12 : #include <linux/buffer_head.h>
13 : #include <linux/dax.h>
14 : #include <linux/writeback.h>
15 : #include <linux/list_sort.h>
16 : #include <linux/swap.h>
17 : #include <linux/bio.h>
18 : #include <linux/sched/signal.h>
19 : #include <linux/migrate.h>
20 : #include "trace.h"
21 :
22 : #include "../internal.h"
23 :
24 : #define IOEND_BATCH_SIZE 4096
25 :
26 : /*
27 : * Structure allocated for each folio when block size < folio size
28 : * to track sub-folio uptodate status and I/O completions.
29 : */
30 : struct iomap_page {
31 : atomic_t read_bytes_pending;
32 : atomic_t write_bytes_pending;
33 : spinlock_t uptodate_lock;
34 : unsigned long uptodate[];
35 : };
36 :
37 : static inline struct iomap_page *to_iomap_page(struct folio *folio)
38 : {
39 1345925206 : if (folio_test_private(folio))
40 969408291 : return folio_get_private(folio);
41 : return NULL;
42 : }
43 :
44 : static struct bio_set iomap_ioend_bioset;
45 :
46 : static struct iomap_page *
47 555245452 : iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
48 : {
49 555245452 : struct iomap_page *iop = to_iomap_page(folio);
50 555245452 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
51 555245452 : gfp_t gfp;
52 :
53 555245452 : if (iop || nr_blocks <= 1)
54 : return iop;
55 :
56 376195606 : if (flags & IOMAP_NOWAIT)
57 : gfp = GFP_NOWAIT;
58 : else
59 376192696 : gfp = GFP_NOFS | __GFP_NOFAIL;
60 :
61 376195606 : iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
62 : gfp);
63 376203096 : if (iop) {
64 376203096 : spin_lock_init(&iop->uptodate_lock);
65 376470045 : if (folio_test_uptodate(folio))
66 266684 : bitmap_fill(iop->uptodate, nr_blocks);
67 376203361 : folio_attach_private(folio, iop);
68 : }
69 : return iop;
70 : }
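/*
 * Worked example (illustrative addition, not part of the original source or
 * the coverage data): with a 16kB folio and 1kB filesystem blocks,
 * i_blocks_per_folio() returns 16, so iomap_page_create() allocates
 * struct_size(iop, uptodate, BITS_TO_LONGS(16)) bytes - the fixed header
 * plus a single unsigned long of uptodate bitmap on a 64-bit machine. When
 * the block size equals the folio size (nr_blocks == 1), no iomap_page is
 * attached at all and the folio-level uptodate flag is sufficient.
 */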
71 :
72 376185226 : static void iomap_page_release(struct folio *folio)
73 : {
74 376185226 : struct iomap_page *iop = folio_detach_private(folio);
75 376205655 : struct inode *inode = folio->mapping->host;
76 376205655 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
77 :
78 376205655 : if (!iop)
79 : return;
80 376205655 : WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
81 376205655 : WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
82 752428567 : WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
83 : folio_test_uptodate(folio));
84 376222912 : kfree(iop);
85 : }
86 :
87 : /*
88 : * Calculate the range inside the folio that we actually need to read.
89 : */
90 498074879 : static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
91 : loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
92 : {
93 498074879 : struct iomap_page *iop = to_iomap_page(folio);
94 498074879 : loff_t orig_pos = *pos;
95 498074879 : loff_t isize = i_size_read(inode);
96 498074879 : unsigned block_bits = inode->i_blkbits;
97 498074879 : unsigned block_size = (1 << block_bits);
98 498074879 : size_t poff = offset_in_folio(folio, *pos);
99 498074879 : size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
100 498074879 : unsigned first = poff >> block_bits;
101 498074879 : unsigned last = (poff + plen - 1) >> block_bits;
102 :
103 : /*
104 : * If the block size is smaller than the page size, we need to check the
105 : * per-block uptodate status and adjust the offset and length if needed
106 : * to avoid reading in already uptodate ranges.
107 : */
108 498074879 : if (iop) {
109 : unsigned int i;
110 :
111 : /* move forward for each leading block marked uptodate */
112 558213179 : for (i = first; i <= last; i++) {
113 522658053 : if (!test_bit(i, iop->uptodate))
114 : break;
115 60138960 : *pos += block_size;
116 60138960 : poff += block_size;
117 60138960 : plen -= block_size;
118 60138960 : first++;
119 : }
120 :
121 : /* truncate len if we find any trailing uptodate block(s) */
122 6557612407 : for ( ; i <= last; i++) {
123 6060246469 : if (test_bit(i, iop->uptodate)) {
124 708281 : plen -= (last - i + 1) * block_size;
125 708281 : last = i - 1;
126 708281 : break;
127 : }
128 : }
129 : }
130 :
131 : /*
132 : * If the extent spans the block that contains the i_size, we need to
133 : * handle both halves separately so that we properly zero data in the
134 : * page cache for blocks that are entirely outside of i_size.
135 : */
136 498074879 : if (orig_pos <= isize && orig_pos + length > isize) {
137 64884554 : unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
138 :
139 64884554 : if (first <= end && last > end)
140 2318463 : plen -= (last - end) * block_size;
141 : }
142 :
143 498074879 : *offp = poff;
144 498074879 : *lenp = plen;
145 498074879 : }
146 :
147 613820965 : static void iomap_iop_set_range_uptodate(struct folio *folio,
148 : struct iomap_page *iop, size_t off, size_t len)
149 : {
150 613820965 : struct inode *inode = folio->mapping->host;
151 613820965 : unsigned first = off >> inode->i_blkbits;
152 613820965 : unsigned last = (off + len - 1) >> inode->i_blkbits;
153 613820965 : unsigned long flags;
154 :
155 613820965 : spin_lock_irqsave(&iop->uptodate_lock, flags);
156 613909825 : bitmap_set(iop->uptodate, first, last - first + 1);
157 631446257 : if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
158 422623772 : folio_mark_uptodate(folio);
159 613901271 : spin_unlock_irqrestore(&iop->uptodate_lock, flags);
160 613874317 : }
161 :
162 614088665 : static void iomap_set_range_uptodate(struct folio *folio,
163 : struct iomap_page *iop, size_t off, size_t len)
164 : {
165 614088665 : if (iop)
166 613802936 : iomap_iop_set_range_uptodate(folio, iop, off, len);
167 : else
168 285729 : folio_mark_uptodate(folio);
169 614140707 : }
170 :
171 41653850 : static void iomap_finish_folio_read(struct folio *folio, size_t offset,
172 : size_t len, int error)
173 : {
174 41653850 : struct iomap_page *iop = to_iomap_page(folio);
175 :
176 41653850 : if (unlikely(error)) {
177 1195 : folio_clear_uptodate(folio);
178 1195 : folio_set_error(folio);
179 : } else {
180 41652655 : iomap_set_range_uptodate(folio, iop, offset, len);
181 : }
182 :
183 83307690 : if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
184 30304401 : folio_unlock(folio);
185 41653850 : }
186 :
187 40369368 : static void iomap_read_end_io(struct bio *bio)
188 : {
189 40369368 : int error = blk_status_to_errno(bio->bi_status);
190 40369368 : struct folio_iter fi;
191 :
192 82023218 : bio_for_each_folio_all(fi, bio)
193 41653850 : iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
194 40369368 : bio_put(bio);
195 40369368 : }
196 :
197 : struct iomap_readpage_ctx {
198 : struct folio *cur_folio;
199 : bool cur_folio_in_bio;
200 : struct bio *bio;
201 : struct readahead_control *rac;
202 : };
203 :
204 : /**
205 : * iomap_read_inline_data - copy inline data into the page cache
206 : * @iter: iteration structure
207 : * @folio: folio to copy to
208 : *
209 : * Copy the inline data in @iter into @folio and zero out the rest of the folio.
210 : * Only a single IOMAP_INLINE extent is allowed at the end of each file.
211 : * Returns zero for success to complete the read, or the usual negative errno.
212 : */
213 0 : static int iomap_read_inline_data(const struct iomap_iter *iter,
214 : struct folio *folio)
215 : {
216 0 : struct iomap_page *iop;
217 0 : const struct iomap *iomap = iomap_iter_srcmap(iter);
218 0 : size_t size = i_size_read(iter->inode) - iomap->offset;
219 0 : size_t poff = offset_in_page(iomap->offset);
220 0 : size_t offset = offset_in_folio(folio, iomap->offset);
221 0 : void *addr;
222 :
223 0 : if (folio_test_uptodate(folio))
224 : return 0;
225 :
226 0 : if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
227 : return -EIO;
228 0 : if (WARN_ON_ONCE(size > PAGE_SIZE -
229 : offset_in_page(iomap->inline_data)))
230 : return -EIO;
231 0 : if (WARN_ON_ONCE(size > iomap->length))
232 : return -EIO;
233 0 : if (offset > 0)
234 0 : iop = iomap_page_create(iter->inode, folio, iter->flags);
235 : else
236 0 : iop = to_iomap_page(folio);
237 :
238 0 : addr = kmap_local_folio(folio, offset);
239 0 : memcpy(addr, iomap->inline_data, size);
240 0 : memset(addr + size, 0, PAGE_SIZE - poff - size);
241 0 : kunmap_local(addr);
242 0 : iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
243 0 : return 0;
244 : }
245 :
246 435341755 : static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
247 : loff_t pos)
248 : {
249 435341755 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
250 :
251 482578787 : return srcmap->type != IOMAP_MAPPED ||
252 435341755 : (srcmap->flags & IOMAP_F_NEW) ||
253 47236919 : pos >= i_size_read(iter->inode);
254 : }
255 :
256 398391156 : static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
257 : struct iomap_readpage_ctx *ctx, loff_t offset)
258 : {
259 398391156 : const struct iomap *iomap = &iter->iomap;
260 398391156 : loff_t pos = iter->pos + offset;
261 398391156 : loff_t length = iomap_length(iter) - offset;
262 398391156 : struct folio *folio = ctx->cur_folio;
263 398391156 : struct iomap_page *iop;
264 398391156 : loff_t orig_pos = pos;
265 398391156 : size_t poff, plen;
266 398391156 : sector_t sector;
267 :
268 398391156 : if (iomap->type == IOMAP_INLINE)
269 0 : return iomap_read_inline_data(iter, folio);
270 :
271 : /* zero post-eof blocks as the page may be mapped */
272 398391156 : iop = iomap_page_create(iter->inode, folio, iter->flags);
273 398384858 : iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
274 398396928 : if (plen == 0)
275 7272263 : goto done;
276 :
277 391124665 : if (iomap_block_needs_zeroing(iter, pos)) {
278 349354099 : folio_zero_range(folio, poff, plen);
279 349360977 : iomap_set_range_uptodate(folio, iop, poff, plen);
280 349364626 : goto done;
281 : }
282 :
283 41770566 : ctx->cur_folio_in_bio = true;
284 41770566 : if (iop)
285 41770556 : atomic_add(plen, &iop->read_bytes_pending);
286 :
287 41769963 : sector = iomap_sector(iomap, pos);
288 41769963 : if (!ctx->bio ||
289 15134947 : bio_end_sector(ctx->bio) != sector ||
290 1401789 : !bio_add_folio(ctx->bio, folio, plen, poff)) {
291 40368337 : gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
292 40368337 : gfp_t orig_gfp = gfp;
293 40368337 : unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
294 :
295 40368337 : if (ctx->bio)
296 12331537 : submit_bio(ctx->bio);
297 :
298 40367988 : if (ctx->rac) /* same as readahead_gfp_mask */
299 7569038 : gfp |= __GFP_NORETRY | __GFP_NOWARN;
300 40367988 : ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
301 : REQ_OP_READ, gfp);
302 : /*
303 : * If the bio_alloc fails, try it again for a single page to
304 : * avoid having to deal with partial page reads. This emulates
305 : * what do_mpage_read_folio does.
306 : */
307 40368848 : if (!ctx->bio) {
308 0 : ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
309 : orig_gfp);
310 : }
311 40368848 : if (ctx->rac)
312 7569064 : ctx->bio->bi_opf |= REQ_RAHEAD;
313 40368848 : ctx->bio->bi_iter.bi_sector = sector;
314 40368848 : ctx->bio->bi_end_io = iomap_read_end_io;
315 40368848 : bio_add_folio_nofail(ctx->bio, folio, plen, poff);
316 : }
317 :
318 1401622 : done:
319 : /*
320 : * Move the caller beyond our range so that it keeps making progress.
321 : * For that, we have to include any leading non-uptodate ranges, but
322 : * we can skip trailing ones as they will be handled in the next
323 : * iteration.
324 : */
325 398406726 : return pos - orig_pos + plen;
326 : }
327 :
328 285135143 : int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
329 : {
330 570270286 : struct iomap_iter iter = {
331 285135143 : .inode = folio->mapping->host,
332 : .pos = folio_pos(folio),
333 : .len = folio_size(folio),
334 : };
335 285135143 : struct iomap_readpage_ctx ctx = {
336 : .cur_folio = folio,
337 : };
338 285135143 : int ret;
339 :
340 285135143 : trace_iomap_readpage(iter.inode, 1);
341 :
342 623412021 : while ((ret = iomap_iter(&iter, ops)) > 0)
343 338278775 : iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
344 :
345 285140868 : if (ret < 0)
346 66 : folio_set_error(folio);
347 :
348 285140868 : if (ctx.bio) {
349 24979091 : submit_bio(ctx.bio);
350 24979396 : WARN_ON_ONCE(!ctx.cur_folio_in_bio);
351 : } else {
352 260161777 : WARN_ON_ONCE(ctx.cur_folio_in_bio);
353 260161777 : folio_unlock(folio);
354 : }
355 :
356 : /*
357 : * Just like mpage_readahead and block_read_full_folio, we always
358 : * return 0 and just set the folio error flag on errors. This
359 : * should be cleaned up throughout the stack eventually.
360 : */
361 285132524 : return 0;
362 : }
363 : EXPORT_SYMBOL_GPL(iomap_read_folio);
364 :
365 27010625 : static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
366 : struct iomap_readpage_ctx *ctx)
367 : {
368 27010625 : loff_t length = iomap_length(iter);
369 27010625 : loff_t done, ret;
370 :
371 87144596 : for (done = 0; done < length; done += ret) {
372 60133871 : if (ctx->cur_folio &&
373 62525840 : offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
374 32088595 : if (!ctx->cur_folio_in_bio)
375 28972624 : folio_unlock(ctx->cur_folio);
376 32088279 : ctx->cur_folio = NULL;
377 : }
378 60133555 : if (!ctx->cur_folio) {
379 45270242 : ctx->cur_folio = readahead_folio(ctx->rac);
380 45271046 : ctx->cur_folio_in_bio = false;
381 : }
382 60134359 : ret = iomap_readpage_iter(iter, ctx, done);
383 60133971 : if (ret <= 0)
384 0 : return ret;
385 : }
386 :
387 : return done;
388 : }
389 :
390 : /**
391 : * iomap_readahead - Attempt to read pages from a file.
392 : * @rac: Describes the pages to be read.
393 : * @ops: The operations vector for the filesystem.
394 : *
395 : * This function is for filesystems to call to implement their readahead
396 : * address_space operation.
397 : *
398 : * Context: The @ops callbacks may submit I/O (eg to read the addresses of
399 : * blocks from disc), and may wait for it. The caller may be trying to
400 : * access a different page, and so sleeping excessively should be avoided.
401 : * It may allocate memory, but should avoid costly allocations. This
402 : * function is called with memalloc_nofs set, so allocations will not cause
403 : * the filesystem to be reentered.
404 : */
405 13182642 : void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
406 : {
407 13182642 : struct iomap_iter iter = {
408 13182642 : .inode = rac->mapping->host,
409 : .pos = readahead_pos(rac),
410 : .len = readahead_length(rac),
411 : };
412 13182642 : struct iomap_readpage_ctx ctx = {
413 : .rac = rac,
414 : };
415 :
416 13182642 : trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
417 :
418 40193355 : while (iomap_iter(&iter, ops) > 0)
419 27010573 : iter.processed = iomap_readahead_iter(&iter, &ctx);
420 :
421 13182679 : if (ctx.bio)
422 3058388 : submit_bio(ctx.bio);
423 13182657 : if (ctx.cur_folio) {
424 13182638 : if (!ctx.cur_folio_in_bio)
425 10973683 : folio_unlock(ctx.cur_folio);
426 : }
427 13182617 : }
428 : EXPORT_SYMBOL_GPL(iomap_readahead);
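/*
 * Illustrative sketch (not part of the original source): filesystems
 * normally wire iomap_read_folio() and iomap_readahead() into their
 * address_space_operations through thin wrappers that supply their own
 * iomap_ops table. The example_* names are hypothetical, and
 * example_read_iomap_ops stands in for the filesystem's iomap_ops table
 * defined elsewhere.
 */
static int example_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &example_read_iomap_ops);
}

static void example_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &example_read_iomap_ops);
}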
429 :
430 : /*
431 : * iomap_is_partially_uptodate checks whether blocks within a folio are
432 : * uptodate or not.
433 : *
434 : * Returns true if all blocks which correspond to the specified part
435 : * of the folio are uptodate.
436 : */
437 3535019 : bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
438 : {
439 3535019 : struct iomap_page *iop = to_iomap_page(folio);
440 3535019 : struct inode *inode = folio->mapping->host;
441 3535019 : unsigned first, last, i;
442 :
443 3535019 : if (!iop)
444 : return false;
445 :
446 : /* Caller's range may extend past the end of this folio */
447 3493762 : count = min(folio_size(folio) - from, count);
448 :
449 : /* First and last blocks in range within folio */
450 3493762 : first = from >> inode->i_blkbits;
451 3493762 : last = (from + count - 1) >> inode->i_blkbits;
452 :
453 7780020 : for (i = first; i <= last; i++)
454 6866011 : if (!test_bit(i, iop->uptodate))
455 : return false;
456 : return true;
457 : }
458 : EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
459 :
460 : /**
461 : * iomap_get_folio - get a folio reference for writing
462 : * @iter: iteration structure
463 : * @pos: start offset of write
464 : *
465 : * Returns a locked reference to the folio at @pos, or an error pointer if the
466 : * folio could not be obtained.
467 : */
468 178917267 : struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
469 : {
470 178917267 : unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
471 :
472 178917267 : if (iter->flags & IOMAP_NOWAIT)
473 0 : fgp |= FGP_NOWAIT;
474 :
475 178917267 : return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
476 : fgp, mapping_gfp_mask(iter->inode->i_mapping));
477 : }
478 : EXPORT_SYMBOL_GPL(iomap_get_folio);
479 :
480 78698950 : bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
481 : {
482 83058901 : trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
483 : folio_size(folio));
484 :
485 : /*
486 : * mm accommodates an old ext3 case where clean folios might
487 : * not have had the dirty bit cleared. Thus, it can send actual
488 : * dirty folios to ->release_folio() via shrink_active_list();
489 : * skip those here.
490 : */
491 78698865 : if (folio_test_dirty(folio) || folio_test_writeback(folio))
492 : return false;
493 78135659 : iomap_page_release(folio);
494 78135659 : return true;
495 : }
496 : EXPORT_SYMBOL_GPL(iomap_release_folio);
497 :
498 299349992 : void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
499 : {
500 299349992 : trace_iomap_invalidate_folio(folio->mapping->host,
501 299349992 : folio_pos(folio) + offset, len);
502 :
503 : /*
504 : * If we're invalidating the entire folio, clear the dirty state
505 : * from it and release it to avoid unnecessary buildup of the LRU.
506 : */
507 595776845 : if (offset == 0 && len == folio_size(folio)) {
508 295716537 : WARN_ON_ONCE(folio_test_writeback(folio));
509 295716537 : folio_cancel_dirty(folio);
510 295716818 : iomap_page_release(folio);
511 3632265 : } else if (folio_test_large(folio)) {
512 : /* Must release the iop so the page can be split */
513 4687529 : WARN_ON_ONCE(!folio_test_uptodate(folio) &&
514 : folio_test_dirty(folio));
515 2343765 : iomap_page_release(folio);
516 : }
517 299359315 : }
518 : EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
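/*
 * Illustrative sketch (not part of the original source): the folio helpers
 * exported above are typically used directly as address_space_operations
 * callbacks, together with the read wrappers sketched earlier. All
 * example_* names are hypothetical.
 */
static const struct address_space_operations example_aops = {
	.read_folio		= example_read_folio,
	.readahead		= example_readahead,
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
};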
519 :
520 : static void
521 13007 : iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
522 : {
523 13007 : loff_t i_size = i_size_read(inode);
524 :
525 : /*
526 : * Only truncate newly allocated pages beyond EOF, even if the
527 : * write started inside the existing inode size.
528 : */
529 13007 : if (pos + len > i_size)
530 3304 : truncate_pagecache_range(inode, max(pos, i_size),
531 : pos + len - 1);
532 13007 : }
533 :
534 5452892 : static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
535 : size_t poff, size_t plen, const struct iomap *iomap)
536 : {
537 5452892 : struct bio_vec bvec;
538 5452892 : struct bio bio;
539 :
540 5452892 : bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
541 5452882 : bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
542 5452882 : bio_add_folio_nofail(&bio, folio, plen, poff);
543 5452883 : return submit_bio_wait(&bio);
544 : }
545 :
546 178947985 : static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
547 : size_t len, struct folio *folio)
548 : {
549 178947985 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
550 178947985 : struct iomap_page *iop;
551 178947985 : loff_t block_size = i_blocksize(iter->inode);
552 178947985 : loff_t block_start = round_down(pos, block_size);
553 178947985 : loff_t block_end = round_up(pos + len, block_size);
554 178947985 : unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
555 178947985 : size_t from = offset_in_folio(folio, pos), to = from + len;
556 178947985 : size_t poff, plen;
557 :
558 258249840 : if (folio_test_uptodate(folio))
559 : return 0;
560 99612674 : folio_clear_error(folio);
561 :
562 99631060 : iop = iomap_page_create(iter->inode, folio, iter->flags);
563 99631432 : if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
564 : return -EAGAIN;
565 :
566 99702164 : do {
567 99702164 : iomap_adjust_read_range(iter->inode, folio, &block_start,
568 : block_end - block_start, &poff, &plen);
569 99704554 : if (plen == 0)
570 : break;
571 :
572 71403620 : if (!(iter->flags & IOMAP_UNSHARE) &&
573 71399086 : (from <= poff || from >= poff + plen) &&
574 47523704 : (to <= poff || to >= poff + plen))
575 27176244 : continue;
576 :
577 44227376 : if (iomap_block_needs_zeroing(iter, block_start)) {
578 38774478 : if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
579 : return -EIO;
580 38774478 : folio_zero_segments(folio, poff, from, to, poff + plen);
581 : } else {
582 5452898 : int status;
583 :
584 5452898 : if (iter->flags & IOMAP_NOWAIT)
585 : return -EAGAIN;
586 :
587 5452898 : status = iomap_read_folio_sync(block_start, folio,
588 : poff, plen, srcmap);
589 5452728 : if (status)
590 1227 : return status;
591 : }
592 44225919 : iomap_set_range_uptodate(folio, iop, poff, plen);
593 71403850 : } while ((block_start += plen) < block_end);
594 :
595 : return 0;
596 : }
597 :
598 178923918 : static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
599 : size_t len)
600 : {
601 178923918 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
602 :
603 178923918 : if (folio_ops && folio_ops->get_folio)
604 0 : return folio_ops->get_folio(iter, pos, len);
605 : else
606 178923918 : return iomap_get_folio(iter, pos);
607 : }
608 :
609 178965168 : static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
610 : struct folio *folio)
611 : {
612 178965168 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
613 :
614 178965168 : if (folio_ops && folio_ops->put_folio) {
615 0 : folio_ops->put_folio(iter->inode, pos, ret, folio);
616 : } else {
617 178965168 : folio_unlock(folio);
618 178960648 : folio_put(folio);
619 : }
620 178964331 : }
621 :
622 0 : static int iomap_write_begin_inline(const struct iomap_iter *iter,
623 : struct folio *folio)
624 : {
625 : /* needs more work for the tailpacking case; disable for now */
626 0 : if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
627 : return -EIO;
628 0 : return iomap_read_inline_data(iter, folio);
629 : }
630 :
631 178843380 : static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
632 : size_t len, struct folio **foliop)
633 : {
634 178843380 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
635 178843380 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
636 178843380 : struct folio *folio;
637 178843380 : int status = 0;
638 :
639 178843380 : BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
640 178843380 : if (srcmap != &iter->iomap)
641 1333629 : BUG_ON(pos + len > srcmap->offset + srcmap->length);
642 :
643 178843380 : if (fatal_signal_pending(current))
644 : return -EINTR;
645 :
646 357846476 : if (!mapping_large_folio_support(iter->inode->i_mapping))
647 0 : len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
648 :
649 178923238 : folio = __iomap_get_folio(iter, pos, len);
650 178955408 : if (IS_ERR(folio))
651 0 : return PTR_ERR(folio);
652 :
653 : /*
654 : * Now that we have a locked folio, before we do anything with it we need to
655 : * check that the iomap we have cached is not stale. The inode extent
656 : * mapping can change due to concurrent IO in flight (e.g.
657 : * IOMAP_UNWRITTEN state can change and memory reclaim could have
658 : * reclaimed a previously partially written page at this index after IO
659 : * completion before this write reaches this file offset) and hence we
660 : * could do the wrong thing here (zero a page range incorrectly or fail
661 : * to zero) and corrupt data.
662 : */
663 178955408 : if (folio_ops && folio_ops->iomap_valid) {
664 178957851 : bool iomap_valid = folio_ops->iomap_valid(iter->inode,
665 : &iter->iomap);
666 178954767 : if (!iomap_valid) {
667 11773 : iter->iomap.flags |= IOMAP_F_STALE;
668 11773 : status = 0;
669 11773 : goto out_unlock;
670 : }
671 : }
672 :
673 179765997 : if (pos + len > folio_pos(folio) + folio_size(folio))
674 18213 : len = folio_pos(folio) + folio_size(folio) - pos;
675 :
676 178940551 : if (srcmap->type == IOMAP_INLINE)
677 0 : status = iomap_write_begin_inline(iter, folio);
678 178940551 : else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
679 0 : status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
680 : else
681 178940551 : status = __iomap_write_begin(iter, pos, len, folio);
682 :
683 178917970 : if (unlikely(status))
684 1227 : goto out_unlock;
685 :
686 178916743 : *foliop = folio;
687 178916743 : return 0;
688 :
689 13000 : out_unlock:
690 13000 : __iomap_put_folio(iter, pos, 0, folio);
691 13000 : iomap_write_failed(iter->inode, pos, len);
692 :
693 13000 : return status;
694 : }
695 :
696 178895513 : static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
697 : size_t copied, struct folio *folio)
698 : {
699 178895513 : struct iomap_page *iop = to_iomap_page(folio);
700 178895513 : flush_dcache_folio(folio);
701 :
702 : /*
703 : * The blocks that were entirely written will now be uptodate, so we
704 : * don't have to worry about a read_folio reading them and overwriting a
705 : * partial write. However, if we've encountered a short write and only
706 : * partially written into a block, it will not be marked uptodate, so a
707 : * read_folio might come in and destroy our partial write.
708 : *
709 : * Do the simplest thing and just treat any short write to a
710 : * non-uptodate page as a zero-length write, and force the caller to
711 : * redo the whole thing.
712 : */
713 178936823 : if (unlikely(copied < len && !folio_test_uptodate(folio)))
714 : return 0;
715 179762255 : iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
716 178882976 : filemap_dirty_folio(inode->i_mapping, folio);
717 178882976 : return copied;
718 : }
719 :
720 0 : static size_t iomap_write_end_inline(const struct iomap_iter *iter,
721 : struct folio *folio, loff_t pos, size_t copied)
722 : {
723 0 : const struct iomap *iomap = &iter->iomap;
724 0 : void *addr;
725 :
726 0 : WARN_ON_ONCE(!folio_test_uptodate(folio));
727 0 : BUG_ON(!iomap_inline_data_valid(iomap));
728 :
729 0 : flush_dcache_folio(folio);
730 0 : addr = kmap_local_folio(folio, pos);
731 0 : memcpy(iomap_inline_data(iomap, pos), addr, copied);
732 0 : kunmap_local(addr);
733 :
734 0 : mark_inode_dirty(iter->inode);
735 0 : return copied;
736 : }
737 :
738 : /* Returns the number of bytes copied. May be 0. Cannot be an errno. */
739 178915155 : static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
740 : size_t copied, struct folio *folio)
741 : {
742 178915155 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
743 178915155 : loff_t old_size = iter->inode->i_size;
744 178915155 : size_t ret;
745 :
746 178915155 : if (srcmap->type == IOMAP_INLINE) {
747 0 : ret = iomap_write_end_inline(iter, folio, pos, copied);
748 178915155 : } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
749 0 : ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
750 : copied, &folio->page, NULL);
751 : } else {
752 178915155 : ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
753 : }
754 :
755 : /*
756 : * Update the in-memory inode size after copying the data into the page
757 : * cache. It's up to the file system to write the updated size to disk,
758 : * preferably after I/O completion so that no stale data is exposed.
759 : */
760 178938382 : if (pos + ret > old_size) {
761 68112102 : i_size_write(iter->inode, pos + ret);
762 68112102 : iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
763 : }
764 178938382 : __iomap_put_folio(iter, pos, ret, folio);
765 :
766 178950518 : if (old_size < pos)
767 7735291 : pagecache_isize_extended(iter->inode, old_size, pos);
768 178950520 : if (ret < len)
769 7 : iomap_write_failed(iter->inode, pos + ret, len - ret);
770 178950520 : return ret;
771 : }
772 :
773 67780021 : static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
774 : {
775 67780021 : loff_t length = iomap_length(iter);
776 67780021 : loff_t pos = iter->pos;
777 67780021 : ssize_t written = 0;
778 67780021 : long status = 0;
779 67780021 : struct address_space *mapping = iter->inode->i_mapping;
780 67780021 : unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
781 :
782 161451006 : do {
783 161451006 : struct folio *folio;
784 161451006 : struct page *page;
785 161451006 : unsigned long offset; /* Offset into pagecache page */
786 161451006 : unsigned long bytes; /* Bytes to write to page */
787 161451006 : size_t copied; /* Bytes copied from user */
788 :
789 161451006 : offset = offset_in_page(pos);
790 161451006 : bytes = min_t(unsigned long, PAGE_SIZE - offset,
791 : iov_iter_count(i));
792 161451013 : again:
793 161451013 : status = balance_dirty_pages_ratelimited_flags(mapping,
794 : bdp_flags);
795 161448531 : if (unlikely(status))
796 : break;
797 :
798 161448531 : if (bytes > length)
799 : bytes = length;
800 :
801 : /*
802 : * Bring in the user page that we'll copy from _first_.
803 : * Otherwise there's a nasty deadlock on copying from the
804 : * same page as we're writing to, without it being marked
805 : * up-to-date.
806 : *
807 : * For async buffered writes the assumption is that the user
808 : * page has already been faulted in. This can be optimized by
809 : * faulting the user page.
810 : */
811 161448531 : if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
812 : status = -EFAULT;
813 : break;
814 : }
815 :
816 161444406 : status = iomap_write_begin(iter, pos, bytes, &folio);
817 161414517 : if (unlikely(status))
818 : break;
819 161414079 : if (iter->iomap.flags & IOMAP_F_STALE)
820 : break;
821 :
822 161402576 : page = folio_file_page(folio, pos >> PAGE_SHIFT);
823 161436744 : if (mapping_writably_mapped(mapping))
824 58285 : flush_dcache_page(page);
825 :
826 161436744 : copied = copy_page_from_iter_atomic(page, offset, bytes, i);
827 :
828 161432215 : status = iomap_write_end(iter, pos, bytes, copied, folio);
829 :
830 161441027 : if (unlikely(copied != status))
831 4 : iov_iter_revert(i, copied - status);
832 :
833 161441027 : cond_resched();
834 161437224 : if (unlikely(status == 0)) {
835 : /*
836 : * A short copy made iomap_write_end() reject the
837 : * thing entirely. Might be memory poisoning
838 : * halfway through, might be a race with munmap,
839 : * might be severe memory pressure.
840 : */
841 7 : if (copied)
842 4 : bytes = copied;
843 7 : goto again;
844 : }
845 161437217 : pos += status;
846 161437217 : written += status;
847 161437217 : length -= status;
848 161437217 : } while (iov_iter_count(i) && length);
849 :
850 67778173 : if (status == -EAGAIN) {
851 0 : iov_iter_revert(i, written);
852 0 : return -EAGAIN;
853 : }
854 67778173 : return written ? written : status;
855 : }
856 :
857 : ssize_t
858 59284893 : iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
859 : const struct iomap_ops *ops)
860 : {
861 59284893 : struct iomap_iter iter = {
862 59284893 : .inode = iocb->ki_filp->f_mapping->host,
863 59284893 : .pos = iocb->ki_pos,
864 : .len = iov_iter_count(i),
865 : .flags = IOMAP_WRITE,
866 : };
867 59284893 : ssize_t ret;
868 :
869 59284893 : if (iocb->ki_flags & IOCB_NOWAIT)
870 0 : iter.flags |= IOMAP_NOWAIT;
871 :
872 127066669 : while ((ret = iomap_iter(&iter, ops)) > 0)
873 67726076 : iter.processed = iomap_write_iter(&iter, i);
874 :
875 59275017 : if (unlikely(iter.pos == iocb->ki_pos))
876 : return ret;
877 58468772 : ret = iter.pos - iocb->ki_pos;
878 58468772 : iocb->ki_pos = iter.pos;
879 58468772 : return ret;
880 : }
881 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
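/*
 * Illustrative sketch (not part of the original source): a minimal
 * ->write_iter implementation built on iomap_file_buffered_write(). Real
 * filesystems also handle direct I/O, shared locking and remapping; the
 * example_* names are hypothetical, and example_buffered_write_iomap_ops
 * stands in for the filesystem's iomap_ops table defined elsewhere.
 */
static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
				&example_buffered_write_iomap_ops);
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}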
882 :
883 : /*
884 : * Scan the data range passed to us for dirty page cache folios. If we find a
885 : * dirty folio, punch out the preceding range and update the offset from
886 : * which the next punch will start.
887 : *
888 : * We can punch out storage reservations under clean pages because they either
889 : * contain data that has been written back - in which case the delalloc punch
890 : * over that range is a no-op - or they were instantiated by read faults, in which case they
891 : * contain zeroes and we can remove the delalloc backing range and any new
892 : * writes to those pages will do the normal hole filling operation...
893 : *
894 : * This makes the logic simple: we only need to keep the delalloc extents
895 : * over the dirty ranges of the page cache.
896 : *
897 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
898 : * simplify range iterations.
899 : */
900 137 : static int iomap_write_delalloc_scan(struct inode *inode,
901 : loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
902 : int (*punch)(struct inode *inode, loff_t offset, loff_t length))
903 : {
904 280 : while (start_byte < end_byte) {
905 143 : struct folio *folio;
906 :
907 : /* grab locked page */
908 143 : folio = filemap_lock_folio(inode->i_mapping,
909 143 : start_byte >> PAGE_SHIFT);
910 143 : if (IS_ERR(folio)) {
911 2 : start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
912 : PAGE_SIZE;
913 2 : continue;
914 : }
915 :
916 : /* if dirty, punch up to offset */
917 141 : if (folio_test_dirty(folio)) {
918 15 : if (start_byte > *punch_start_byte) {
919 0 : int error;
920 :
921 0 : error = punch(inode, *punch_start_byte,
922 : start_byte - *punch_start_byte);
923 0 : if (error) {
924 0 : folio_unlock(folio);
925 0 : folio_put(folio);
926 0 : return error;
927 : }
928 : }
929 :
930 : /*
931 : * Make sure the next punch start is correctly bound to
932 : * the end of this data range, not the end of the folio.
933 : */
934 25 : *punch_start_byte = min_t(loff_t, end_byte,
935 : folio_next_index(folio) << PAGE_SHIFT);
936 : }
937 :
938 : /* move offset to start of next folio in range */
939 141 : start_byte = folio_next_index(folio) << PAGE_SHIFT;
940 141 : folio_unlock(folio);
941 141 : folio_put(folio);
942 : }
943 : return 0;
944 : }
945 :
946 : /*
947 : * Punch out all the delalloc blocks in the range given except for those that
948 : * have dirty data still pending in the page cache - those are going to be
949 : * written and so must still retain the delalloc backing for writeback.
950 : *
951 : * As we are scanning the page cache for data, we don't need to reimplement the
952 : * wheel - mapping_seek_hole_data() does exactly what we need to identify the
953 : * start and end of data ranges correctly even for sub-folio block sizes. This
954 : * byte range based iteration is especially convenient because it means we
955 : * don't have to care about variable size folios, nor where the start or end of
956 : * the data range lies within a folio, if they lie within the same folio or even
957 : * if there are multiple discontiguous data ranges within the folio.
958 : *
959 : * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
960 : * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
961 : * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
962 : * date. A write page fault can then mark it dirty. If we then fail a write()
963 : * beyond EOF into that up to date cached range, we allocate a delalloc block
964 : * beyond EOF and then have to punch it out. Because the range is up to date,
965 : * mapping_seek_hole_data() will return it, and we will skip the punch because
966 : * the folio is dirty. This is incorrect - we always need to punch out delalloc
967 : * beyond EOF in this case as writeback will never write back and convert that
968 : * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
969 : * resulting in always punching out the range from the EOF to the end of the
970 : * range the iomap spans.
971 : *
972 : * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
973 : * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
974 : * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
975 : * returns the end of the data range (data_end). Using closed intervals would
976 : * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
977 : * the code to subtle off-by-one bugs....
978 : */
979 6430 : static int iomap_write_delalloc_release(struct inode *inode,
980 : loff_t start_byte, loff_t end_byte,
981 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
982 : {
983 6430 : loff_t punch_start_byte = start_byte;
984 6430 : loff_t scan_end_byte = min(i_size_read(inode), end_byte);
985 6430 : int error = 0;
986 :
987 : /*
988 : * Lock the mapping to avoid races with page faults re-instantiating
989 : * folios and dirtying them via ->page_mkwrite whilst we walk the
990 : * cache and perform delalloc extent removal. Failing to do this can
991 : * leave dirty pages with no space reservation in the cache.
992 : */
993 6430 : filemap_invalidate_lock(inode->i_mapping);
994 6567 : while (start_byte < scan_end_byte) {
995 5408 : loff_t data_end;
996 :
997 5408 : start_byte = mapping_seek_hole_data(inode->i_mapping,
998 : start_byte, scan_end_byte, SEEK_DATA);
999 : /*
1000 : * If there is no more data to scan, all that is left is to
1001 : * punch out the remaining range.
1002 : */
1003 5408 : if (start_byte == -ENXIO || start_byte == scan_end_byte)
1004 : break;
1005 137 : if (start_byte < 0) {
1006 0 : error = start_byte;
1007 0 : goto out_unlock;
1008 : }
1009 137 : WARN_ON_ONCE(start_byte < punch_start_byte);
1010 137 : WARN_ON_ONCE(start_byte > scan_end_byte);
1011 :
1012 : /*
1013 : * We find the end of this contiguous cached data range by
1014 : * seeking from start_byte to the beginning of the next hole.
1015 : */
1016 137 : data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
1017 : scan_end_byte, SEEK_HOLE);
1018 137 : if (data_end < 0) {
1019 0 : error = data_end;
1020 0 : goto out_unlock;
1021 : }
1022 137 : WARN_ON_ONCE(data_end <= start_byte);
1023 137 : WARN_ON_ONCE(data_end > scan_end_byte);
1024 :
1025 137 : error = iomap_write_delalloc_scan(inode, &punch_start_byte,
1026 : start_byte, data_end, punch);
1027 137 : if (error)
1028 0 : goto out_unlock;
1029 :
1030 : /* The next data search starts at the end of this one. */
1031 : start_byte = data_end;
1032 : }
1033 :
1034 6430 : if (punch_start_byte < end_byte)
1035 6415 : error = punch(inode, punch_start_byte,
1036 : end_byte - punch_start_byte);
1037 15 : out_unlock:
1038 6430 : filemap_invalidate_unlock(inode->i_mapping);
1039 6430 : return error;
1040 : }
1041 :
1042 : /*
1043 : * When a short write occurs, the filesystem may need to remove reserved space
1044 : * that was allocated in ->iomap_begin from its ->iomap_end method. For
1045 : * filesystems that use delayed allocation, we need to punch out delalloc
1046 : * extents from the range that are not dirty in the page cache. As the write can
1047 : * race with page faults, there can be dirty pages over the delalloc extent
1048 : * outside the range of a short write but still within the delalloc extent
1049 : * allocated for this iomap.
1050 : *
1051 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
1052 : * simplify range iterations.
1053 : *
1054 : * The punch() callback *must* only punch delalloc extents in the range passed
1055 : * to it. It must skip over all other types of extents in the range and leave
1056 : * them completely unchanged. It must do this punch atomically with respect to
1057 : * other extent modifications.
1058 : *
1059 : * The punch() callback may be called with a folio locked to prevent writeback
1060 : * extent allocation racing at the edge of the range we are currently punching.
1061 : * The locked folio may or may not cover the range being punched, so it is not
1062 : * safe for the punch() callback to lock folios itself.
1063 : *
1064 : * Lock order is:
1065 : *
1066 : * inode->i_rwsem (shared or exclusive)
1067 : * inode->i_mapping->invalidate_lock (exclusive)
1068 : * folio_lock()
1069 : * ->punch
1070 : * internal filesystem allocation lock
1071 : */
1072 117124058 : int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
1073 : struct iomap *iomap, loff_t pos, loff_t length,
1074 : ssize_t written,
1075 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
1076 : {
1077 117124058 : loff_t start_byte;
1078 117124058 : loff_t end_byte;
1079 117124058 : unsigned int blocksize = i_blocksize(inode);
1080 :
1081 117124058 : if (iomap->type != IOMAP_DELALLOC)
1082 : return 0;
1083 :
1084 : /* If we didn't reserve the blocks, we're not allowed to punch them. */
1085 56172667 : if (!(iomap->flags & IOMAP_F_NEW))
1086 : return 0;
1087 :
1088 : /*
1089 : * start_byte refers to the first unused block after a short write. If
1090 : * nothing was written, round offset down to point at the first block in
1091 : * the range.
1092 : */
1093 18111985 : if (unlikely(!written))
1094 4112 : start_byte = round_down(pos, blocksize);
1095 : else
1096 18107873 : start_byte = round_up(pos + written, blocksize);
1097 18111985 : end_byte = round_up(pos + length, blocksize);
1098 :
1099 : /* Nothing to do if we've written the entire delalloc extent */
1100 18111985 : if (start_byte >= end_byte)
1101 : return 0;
1102 :
1103 6430 : return iomap_write_delalloc_release(inode, start_byte, end_byte,
1104 : punch);
1105 : }
1106 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
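/*
 * Illustrative sketch (not part of the original source): a delalloc-based
 * filesystem would call the helper above from its ->iomap_end method,
 * passing a punch callback that removes only delalloc extents over the
 * given byte range. Both example_punch_delalloc() (shown as a stub) and
 * example_buffered_write_iomap_end() are hypothetical.
 */
static int example_punch_delalloc(struct inode *inode, loff_t offset, loff_t length)
{
	/* remove delalloc extents covering [offset, offset + length) only */
	return 0;
}

static int example_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	return iomap_file_buffered_write_punch_delalloc(inode, iomap, pos,
			length, written, example_punch_delalloc);
}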
1107 :
1108 12833 : static loff_t iomap_unshare_iter(struct iomap_iter *iter)
1109 : {
1110 12833 : struct iomap *iomap = &iter->iomap;
1111 12833 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1112 12833 : loff_t pos = iter->pos;
1113 12833 : loff_t length = iomap_length(iter);
1114 12833 : long status = 0;
1115 12833 : loff_t written = 0;
1116 :
1117 : /* don't bother with blocks that are not shared to start with */
1118 12833 : if (!(iomap->flags & IOMAP_F_SHARED))
1119 : return length;
1120 : /* don't bother with holes or unwritten extents */
1121 4848 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1122 : return length;
1123 :
1124 6083 : do {
1125 6083 : unsigned long offset = offset_in_page(pos);
1126 6083 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1127 6083 : struct folio *folio;
1128 :
1129 6083 : status = iomap_write_begin(iter, pos, bytes, &folio);
1130 6083 : if (unlikely(status))
1131 0 : return status;
1132 6083 : if (iter->iomap.flags & IOMAP_F_STALE)
1133 : break;
1134 :
1135 6083 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1136 6083 : if (WARN_ON_ONCE(status == 0))
1137 : return -EIO;
1138 :
1139 6083 : cond_resched();
1140 :
1141 6083 : pos += status;
1142 6083 : written += status;
1143 6083 : length -= status;
1144 :
1145 6083 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1146 6083 : } while (length);
1147 :
1148 : return written;
1149 : }
1150 :
1151 : int
1152 40 : iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1153 : const struct iomap_ops *ops)
1154 : {
1155 40 : struct iomap_iter iter = {
1156 : .inode = inode,
1157 : .pos = pos,
1158 : .len = len,
1159 : .flags = IOMAP_WRITE | IOMAP_UNSHARE,
1160 : };
1161 40 : int ret;
1162 :
1163 12873 : while ((ret = iomap_iter(&iter, ops)) > 0)
1164 12833 : iter.processed = iomap_unshare_iter(&iter);
1165 40 : return ret;
1166 : }
1167 : EXPORT_SYMBOL_GPL(iomap_file_unshare);
1168 :
1169 0 : static loff_t iomap_dirty_iter(struct iomap_iter *iter)
1170 : {
1171 0 : loff_t pos = iter->pos;
1172 0 : loff_t length = iomap_length(iter);
1173 0 : long status = 0;
1174 0 : loff_t written = 0;
1175 :
1176 0 : do {
1177 0 : unsigned long offset = offset_in_page(pos);
1178 0 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1179 0 : struct folio *folio;
1180 :
1181 0 : status = iomap_write_begin(iter, pos, bytes, &folio);
1182 0 : if (unlikely(status))
1183 0 : return status;
1184 :
1185 0 : folio_mark_accessed(folio);
1186 :
1187 0 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1188 0 : if (WARN_ON_ONCE(status == 0))
1189 : return -EIO;
1190 :
1191 0 : cond_resched();
1192 :
1193 0 : pos += status;
1194 0 : written += status;
1195 0 : length -= status;
1196 :
1197 0 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1198 0 : } while (length);
1199 :
1200 : return written;
1201 : }
1202 :
1203 : int
1204 0 : iomap_dirty_range(struct inode *inode, loff_t pos, u64 len,
1205 : const struct iomap_ops *ops)
1206 : {
1207 0 : struct iomap_iter iter = {
1208 : .inode = inode,
1209 : .pos = pos,
1210 : .len = len,
1211 : .flags = IOMAP_WRITE,
1212 : };
1213 0 : int ret;
1214 :
1215 0 : if (IS_DAX(inode))
1216 : return -EINVAL;
1217 :
1218 0 : while ((ret = iomap_iter(&iter, ops)) > 0)
1219 0 : iter.processed = iomap_dirty_iter(&iter);
1220 0 : return ret;
1221 : }
1222 : EXPORT_SYMBOL_GPL(iomap_dirty_range);
1223 :
1224 49357116 : static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
1225 : {
1226 49357116 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1227 49357116 : loff_t pos = iter->pos;
1228 49357116 : loff_t length = iomap_length(iter);
1229 49357116 : loff_t written = 0;
1230 :
1231 : /* already zeroed? we're done. */
1232 49357116 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1233 : return length;
1234 :
1235 17512770 : do {
1236 17512770 : struct folio *folio;
1237 17512770 : int status;
1238 17512770 : size_t offset;
1239 17512770 : size_t bytes = min_t(u64, SIZE_MAX, length);
1240 :
1241 17512770 : status = iomap_write_begin(iter, pos, bytes, &folio);
1242 17507820 : if (status)
1243 933 : return status;
1244 17506887 : if (iter->iomap.flags & IOMAP_F_STALE)
1245 : break;
1246 :
1247 17506617 : offset = offset_in_folio(folio, pos);
1248 17554411 : if (bytes > folio_size(folio) - offset)
1249 18213 : bytes = folio_size(folio) - offset;
1250 :
1251 17506617 : folio_zero_range(folio, offset, bytes);
1252 17508500 : folio_mark_accessed(folio);
1253 :
1254 17511488 : bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
1255 17511616 : if (WARN_ON_ONCE(bytes == 0))
1256 : return -EIO;
1257 :
1258 17511616 : pos += bytes;
1259 17511616 : length -= bytes;
1260 17511616 : written += bytes;
1261 17511616 : } while (length > 0);
1262 :
1263 17493756 : if (did_zero)
1264 1883014 : *did_zero = true;
1265 : return written;
1266 : }
1267 :
1268 : int
1269 37030528 : iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1270 : const struct iomap_ops *ops)
1271 : {
1272 37030528 : struct iomap_iter iter = {
1273 : .inode = inode,
1274 : .pos = pos,
1275 : .len = len,
1276 : .flags = IOMAP_ZERO,
1277 : };
1278 37030528 : int ret;
1279 :
1280 86382856 : while ((ret = iomap_iter(&iter, ops)) > 0)
1281 49353640 : iter.processed = iomap_zero_iter(&iter, did_zero);
1282 37030558 : return ret;
1283 : }
1284 : EXPORT_SYMBOL_GPL(iomap_zero_range);
1285 :
1286 : int
1287 3072444 : iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1288 : const struct iomap_ops *ops)
1289 : {
1290 3072444 : unsigned int blocksize = i_blocksize(inode);
1291 3072444 : unsigned int off = pos & (blocksize - 1);
1292 :
1293 : /* Block boundary? Nothing to do */
1294 3072444 : if (!off)
1295 : return 0;
1296 2491705 : return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
1297 : }
1298 : EXPORT_SYMBOL_GPL(iomap_truncate_page);
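/*
 * Illustrative sketch (not part of the original source): truncate paths
 * commonly use iomap_truncate_page() to zero the tail of the final block
 * before shrinking i_size, e.g. from a ->setattr implementation. The
 * example_* names are hypothetical.
 */
static int example_truncate_tail(struct inode *inode, loff_t newsize,
		bool *did_zero)
{
	return iomap_truncate_page(inode, newsize, did_zero,
			&example_buffered_write_iomap_ops);
}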
1299 :
1300 6967240 : static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
1301 : struct folio *folio)
1302 : {
1303 6967240 : loff_t length = iomap_length(iter);
1304 6967240 : int ret;
1305 :
1306 6967240 : if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
1307 0 : ret = __block_write_begin_int(folio, iter->pos, length, NULL,
1308 0 : &iter->iomap);
1309 0 : if (ret)
1310 0 : return ret;
1311 0 : block_commit_write(&folio->page, 0, length);
1312 : } else {
1313 13934496 : WARN_ON_ONCE(!folio_test_uptodate(folio));
1314 6967257 : folio_mark_dirty(folio);
1315 : }
1316 :
1317 : return length;
1318 : }
1319 :
1320 4031512 : vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1321 : {
1322 4031512 : struct iomap_iter iter = {
1323 4031512 : .inode = file_inode(vmf->vma->vm_file),
1324 : .flags = IOMAP_WRITE | IOMAP_FAULT,
1325 : };
1326 4031512 : struct folio *folio = page_folio(vmf->page);
1327 4031512 : ssize_t ret;
1328 :
1329 4031512 : folio_lock(folio);
1330 4031515 : ret = folio_mkwrite_check_truncate(folio, iter.inode);
1331 4031506 : if (ret < 0)
1332 167 : goto out_unlock;
1333 4031339 : iter.pos = folio_pos(folio);
1334 4031339 : iter.len = ret;
1335 10998537 : while ((ret = iomap_iter(&iter, ops)) > 0)
1336 6967240 : iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
1337 :
1338 4031282 : if (ret < 0)
1339 29716 : goto out_unlock;
1340 4001566 : folio_wait_stable(folio);
1341 4001566 : return VM_FAULT_LOCKED;
1342 29883 : out_unlock:
1343 29883 : folio_unlock(folio);
1344 29882 : return block_page_mkwrite_return(ret);
1345 : }
1346 : EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
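/*
 * Illustrative sketch (not part of the original source): a typical
 * ->page_mkwrite handler brackets iomap_page_mkwrite() with pagefault
 * freeze protection and a timestamp update. The example_* names are
 * hypothetical.
 */
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	ret = iomap_page_mkwrite(vmf, &example_buffered_write_iomap_ops);
	sb_end_pagefault(inode->i_sb);
	return ret;
}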
1347 :
1348 68520493 : static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
1349 : size_t len, int error)
1350 : {
1351 68520493 : struct iomap_page *iop = to_iomap_page(folio);
1352 :
1353 68520493 : if (error) {
1354 136514 : folio_set_error(folio);
1355 136514 : mapping_set_error(inode->i_mapping, error);
1356 : }
1357 :
1358 139692872 : WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
1359 137040711 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1360 :
1361 137040685 : if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1362 57069383 : folio_end_writeback(folio);
1363 68520490 : }
1364 :
1365 : /*
1366 : * We're now finished for good with this ioend structure. Update the page
1367 : * state, release holds on bios, and finally free up memory. Do not use the
1368 : * ioend after this.
1369 : */
1370 : static u32
1371 44570799 : iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1372 : {
1373 44570799 : struct inode *inode = ioend->io_inode;
1374 44570799 : struct bio *bio = &ioend->io_inline_bio;
1375 44570799 : struct bio *last = ioend->io_bio, *next;
1376 44570799 : u64 start = bio->bi_iter.bi_sector;
1377 44570799 : loff_t offset = ioend->io_offset;
1378 44570799 : bool quiet = bio_flagged(bio, BIO_QUIET);
1379 44570799 : u32 folio_count = 0;
1380 :
1381 89144799 : for (bio = &ioend->io_inline_bio; bio; bio = next) {
1382 44573833 : struct folio_iter fi;
1383 :
1384 : /*
1385 : * For the last bio, bi_private points to the ioend, so we
1386 : * need to explicitly end the iteration here.
1387 : */
1388 44573833 : if (bio == last)
1389 : next = NULL;
1390 : else
1391 2920 : next = bio->bi_private;
1392 :
1393 : /* walk all folios in bio, ending page IO on them */
1394 113094351 : bio_for_each_folio_all(fi, bio) {
1395 68520489 : iomap_finish_folio_write(inode, fi.folio, fi.length,
1396 : error);
1397 68520446 : folio_count++;
1398 : }
1399 44573887 : bio_put(bio);
1400 : }
1401 : /* The ioend has been freed by bio_put() */
1402 :
1403 44570966 : if (unlikely(error && !quiet)) {
1404 44478 : printk_ratelimited(KERN_ERR
1405 : "%s: writeback error on inode %lu, offset %lld, sector %llu",
1406 : inode->i_sb->s_id, inode->i_ino, offset, start);
1407 : }
1408 44570965 : return folio_count;
1409 : }
1410 :
1411 : /*
1412 : * Ioend completion routine for merged bios. This can only be called from task
1413 : * contexts as merged ioends can be of unbound length. Hence we have to break up
1414 : * the writeback completions into manageable chunks to avoid long scheduler
1415 : * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
1416 : * good batch processing throughput without creating adverse scheduler latency
1417 : * conditions.
1418 : */
1419 : void
1420 39363668 : iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1421 : {
1422 39363668 : struct list_head tmp;
1423 39363668 : u32 completions;
1424 :
1425 39363668 : might_sleep();
1426 :
1427 39363668 : list_replace_init(&ioend->io_list, &tmp);
1428 39363668 : completions = iomap_finish_ioend(ioend, error);
1429 :
1430 39368454 : while (!list_empty(&tmp)) {
1431 4786 : if (completions > IOEND_BATCH_SIZE * 8) {
1432 0 : cond_resched();
1433 0 : completions = 0;
1434 : }
1435 4786 : ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1436 4786 : list_del_init(&ioend->io_list);
1437 4786 : completions += iomap_finish_ioend(ioend, error);
1438 : }
1439 39363668 : }
1440 : EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1441 :
1442 : /*
1443 : * We can merge two adjacent ioends if they have the same set of work to do.
1444 : */
1445 : static bool
1446 6837517 : iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1447 : {
1448 6837517 : if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1449 : return false;
1450 6837496 : if ((ioend->io_flags & IOMAP_F_SHARED) ^
1451 6837496 : (next->io_flags & IOMAP_F_SHARED))
1452 : return false;
1453 6755522 : if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1454 6755522 : (next->io_type == IOMAP_UNWRITTEN))
1455 : return false;
1456 5639039 : if (ioend->io_offset + ioend->io_size != next->io_offset)
1457 : return false;
1458 : /*
1459 : * Do not merge physically discontiguous ioends. The filesystem
1460 : * completion functions will have to iterate the physical
1461 : * discontiguities even if we merge the ioends at a logical level, so
1462 : * we don't gain anything by merging physical discontiguities here.
1463 : *
1464 : * We cannot use bio->bi_iter.bi_sector here as it is modified during
1465 : * submission so does not point to the start sector of the bio at
1466 : * completion.
1467 : */
1468 3989723 : if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
1469 3984937 : return false;
1470 : return true;
1471 : }
1472 :
1473 : void
1474 39363665 : iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1475 : {
1476 39363665 : struct iomap_ioend *next;
1477 :
1478 39363665 : INIT_LIST_HEAD(&ioend->io_list);
1479 :
1480 39368454 : while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1481 : io_list))) {
1482 6837517 : if (!iomap_ioend_can_merge(ioend, next))
1483 : break;
1484 4786 : list_move_tail(&next->io_list, &ioend->io_list);
1485 4789 : ioend->io_size += next->io_size;
1486 : }
1487 39363668 : }
1488 : EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
1489 :
1490 : static int
1491 10710616 : iomap_ioend_compare(void *priv, const struct list_head *a,
1492 : const struct list_head *b)
1493 : {
1494 10710616 : struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1495 10710616 : struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1496 :
1497 10710616 : if (ia->io_offset < ib->io_offset)
1498 : return -1;
1499 4810795 : if (ia->io_offset > ib->io_offset)
1500 4810764 : return 1;
1501 : return 0;
1502 : }
1503 :
1504 : void
1505 32530937 : iomap_sort_ioends(struct list_head *ioend_list)
1506 : {
1507 32530937 : list_sort(NULL, ioend_list, iomap_ioend_compare);
1508 32530937 : }
1509 : EXPORT_SYMBOL_GPL(iomap_sort_ioends);
1510 :
1511 5202513 : static void iomap_writepage_end_bio(struct bio *bio)
1512 : {
1513 5202513 : struct iomap_ioend *ioend = bio->bi_private;
1514 :
1515 5202513 : iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1516 5202513 : }
1517 :
1518 : /*
1519 : * Submit the final bio for an ioend.
1520 : *
1521 : * If @error is non-zero, it means that we have a situation where some part of
1522 : * the submission process has failed after we've marked pages for writeback
1523 : * and unlocked them. In this situation, we need to fail the bio instead of
1524 : * submitting it. This typically only happens on a filesystem shutdown.
1525 : */
1526 : static int
1527 44570838 : iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1528 : int error)
1529 : {
1530 44570838 : ioend->io_bio->bi_private = ioend;
1531 44570838 : ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1532 :
1533 44570838 : if (wpc->ops->prepare_ioend)
1534 44570838 : error = wpc->ops->prepare_ioend(ioend, error);
1535 44570681 : if (error) {
1536 : /*
1537 : * If we're failing the IO now, just mark the ioend with an
1538 : * error and finish it. This will run IO completion immediately
1539 : * as there is only one reference to the ioend at this point in
1540 : * time.
1541 : */
1542 198 : ioend->io_bio->bi_status = errno_to_blk_status(error);
1543 198 : bio_endio(ioend->io_bio);
1544 198 : return error;
1545 : }
1546 :
1547 44570483 : submit_bio(ioend->io_bio);
1548 44570483 : return 0;
1549 : }
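/*
 * Editor's sketch (not part of the original source): ->prepare_ioend gives
 * the filesystem a last chance to inspect the ioend before the bio is
 * submitted, e.g. to turn a filesystem-wide shutdown into an I/O error.
 * example_fs_is_shutdown() is a hypothetical helper.
 */
static int example_prepare_ioend(struct iomap_ioend *ioend, int status)
{
	if (!status && example_fs_is_shutdown(ioend->io_inode->i_sb))
		status = -EIO;
	return status;
}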
1550 :
1551 : static struct iomap_ioend *
1552 44570628 : iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1553 : loff_t offset, sector_t sector, struct writeback_control *wbc)
1554 : {
1555 44570628 : struct iomap_ioend *ioend;
1556 44570628 : struct bio *bio;
1557 :
1558 79199577 : bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
1559 : REQ_OP_WRITE | wbc_to_write_flags(wbc),
1560 : GFP_NOFS, &iomap_ioend_bioset);
1561 44570836 : bio->bi_iter.bi_sector = sector;
1562 44570836 : wbc_init_bio(wbc, bio);
1563 :
1564 44570609 : ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1565 44570609 : INIT_LIST_HEAD(&ioend->io_list);
1566 44570609 : ioend->io_type = wpc->iomap.type;
1567 44570609 : ioend->io_flags = wpc->iomap.flags;
1568 44570609 : ioend->io_inode = inode;
1569 44570609 : ioend->io_size = 0;
1570 44570609 : ioend->io_folios = 0;
1571 44570609 : ioend->io_offset = offset;
1572 44570609 : ioend->io_bio = bio;
1573 44570609 : ioend->io_sector = sector;
1574 44570609 : return ioend;
1575 : }
1576 :
1577 : /*
1578 : * Allocate a new bio, and chain the old bio to the new one.
1579 : *
1580 : * Note that we have to perform the chaining in this unintuitive order
1581 : * so that the bi_private linkage is set up in the right direction for the
1582 : * traversal in iomap_finish_ioend().
1583 : */
1584 : static struct bio *
1585 2920 : iomap_chain_bio(struct bio *prev)
1586 : {
1587 2920 : struct bio *new;
1588 :
1589 2920 : new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
1590 2920 : bio_clone_blkg_association(new, prev);
1591 2920 : new->bi_iter.bi_sector = bio_end_sector(prev);
1592 :
1593 2920 : bio_chain(prev, new);
1594 2920 : bio_get(prev); /* for iomap_finish_ioend */
1595 2920 : submit_bio(prev);
1596 2920 : return new;
1597 : }
1598 :
1599 : static bool
1600 486086988 : iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1601 : sector_t sector)
1602 : {
1603 486086988 : if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1604 486086988 : (wpc->ioend->io_flags & IOMAP_F_SHARED))
1605 : return false;
1606 485837114 : if (wpc->iomap.type != wpc->ioend->io_type)
1607 : return false;
1608 480154933 : if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1609 : return false;
1610 478674621 : if (sector != bio_end_sector(wpc->ioend->io_bio))
1611 : return false;
1612 : /*
1613 : * Limit ioend bio chain lengths to minimise IO completion latency. This
1614 : * also prevents the completion path from spinning in a long tight loop
1615 : * ending writeback on every folio in the ioend.
1616 : */
1617 469196996 : if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
1618 92 : return false;
1619 : return true;
1620 : }
1621 :
1622 : /*
1623 : * Test to see if we have an existing ioend structure that we could append to
1624 : * first; otherwise finish off the current ioend and start another.
1625 : */
1626 : static void
1627 513765493 : iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
1628 : struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1629 : struct writeback_control *wbc, struct list_head *iolist)
1630 : {
1631 513765493 : sector_t sector = iomap_sector(&wpc->iomap, pos);
1632 513765493 : unsigned len = i_blocksize(inode);
1633 513765493 : size_t poff = offset_in_folio(folio, pos);
1634 :
1635 513765493 : if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
1636 44570842 : if (wpc->ioend)
1637 16890502 : list_add(&wpc->ioend->io_list, iolist);
1638 44570843 : wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
1639 : }
1640 :
1641 513765153 : if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
1642 2920 : wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
1643 2920 : bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
1644 : }
1645 :
1646 513767134 : if (iop)
1647 513766821 : atomic_add(len, &iop->write_bytes_pending);
1648 513767383 : wpc->ioend->io_size += len;
1649 513767383 : wbc_account_cgroup_owner(wbc, &folio->page, len);
1650 513767172 : }
1651 :
1652 : /*
1653 : * We implement an immediate ioend submission policy here to avoid needing to
1654 : * chain multiple ioends and hence nest mempool allocations which can violate
1655 : * the forward progress guarantees we need to provide. The current ioend we're
1656 : * adding blocks to is cached in the writepage context, and if the new block
1657 : * doesn't append to the cached ioend, we create a new ioend and cache that
1658 : * instead.
1659 : *
1660 : * If a new ioend is created and cached, the old ioend is returned and queued
1661 : * locally for submission once the entire page is processed or an error has been
1662 : * detected. Although ioends are submitted as soon as they are fully built,
1663 : * batching optimisations are provided by higher level block plugging.
1664 : *
1665 : * At the end of a writeback pass, there will be a cached ioend remaining on the
1666 : * writepage context that the caller will need to submit.
1667 : */
1668 : static int
1669 57246259 : iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1670 : struct writeback_control *wbc, struct inode *inode,
1671 : struct folio *folio, u64 end_pos)
1672 : {
1673 57246259 : struct iomap_page *iop = iomap_page_create(inode, folio, 0);
1674 57246494 : struct iomap_ioend *ioend, *next;
1675 57246494 : unsigned len = i_blocksize(inode);
1676 57246494 : unsigned nblocks = i_blocks_per_folio(inode, folio);
1677 57246494 : u64 pos = folio_pos(folio);
1678 57246494 : int error = 0, count = 0, i;
1679 57246494 : LIST_HEAD(submit_list);
1680 :
1681 114492674 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1682 :
1683 : /*
1684 : * Walk through the folio to find areas to write back. If we
1685 : * run off the end of the current map or find the current map
1686 : * invalid, grab a new one.
1687 : */
1688 883080091 : for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1689 826007526 : if (iop && !test_bit(i, iop->uptodate))
1690 291235743 : continue;
1691 :
1692 534771783 : error = wpc->ops->map_blocks(wpc, inode, pos);
1693 534770980 : if (error)
1694 : break;
1695 534597118 : trace_iomap_writepage_map(inode, &wpc->iomap);
1696 534598179 : if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1697 0 : continue;
1698 534598179 : if (wpc->iomap.type == IOMAP_HOLE)
1699 20830648 : continue;
1700 513767531 : iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
1701 : &submit_list);
1702 513767206 : count++;
1703 : }
1704 57246427 : if (count)
1705 57069083 : wpc->ioend->io_folios++;
1706 :
1707 57423099 : WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1708 57246427 : WARN_ON_ONCE(!folio_test_locked(folio));
1709 57246427 : WARN_ON_ONCE(folio_test_writeback(folio));
1710 57246427 : WARN_ON_ONCE(folio_test_dirty(folio));
1711 :
1712 : /*
1713 : * We cannot cancel the ioend directly here on error. We may have
1714 : * already set other pages under writeback and hence we have to run I/O
1715 : * completion to mark the error state of the pages under writeback
1716 : * appropriately.
1717 : */
1718 57246427 : if (unlikely(error)) {
1719 : /*
1720 : * Let the filesystem know what portion of the current page
1721 : * failed to map. If the page hasn't been added to the ioend, it
1722 : * won't be affected by I/O completion and we must unlock it
1723 : * now.
1724 : */
1725 173862 : if (wpc->ops->discard_folio)
1726 173862 : wpc->ops->discard_folio(folio, pos);
1727 173862 : if (!count) {
1728 173826 : folio_unlock(folio);
1729 173826 : goto done;
1730 : }
1731 : }
1732 :
1733 57072601 : folio_start_writeback(folio);
1734 57072603 : folio_unlock(folio);
1735 :
1736 : /*
1737 : * Preserve the original error if there was one; catch
1738 : * submission errors here and propagate into subsequent ioend
1739 : * submissions.
1740 : */
1741 73963043 : list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1742 16890395 : int error2;
1743 :
1744 16890395 : list_del_init(&ioend->io_list);
1745 16890502 : error2 = iomap_submit_ioend(wpc, ioend, error);
1746 16890496 : if (error2 && !error)
1747 0 : error = error2;
1748 : }
1749 :
1750 : /*
1751 : * We can end up here with no error and nothing to write only if we race
1752 : * with a partial page truncate on a filesystem with a sub-page block size.
1753 : */
1754 57072648 : if (!count)
1755 3518 : folio_end_writeback(folio);
1756 57069130 : done:
1757 57246474 : mapping_set_error(inode->i_mapping, error);
1758 57246386 : return error;
1759 : }
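/*
 * Editor's sketch (not part of the original source): ->discard_folio is
 * called above when mapping part of a folio fails, so the filesystem can
 * release delalloc or preallocated blocks from @pos to the end of the
 * folio.  example_punch_delalloc() is a hypothetical helper.
 */
static void example_discard_folio(struct folio *folio, loff_t pos)
{
	struct inode *inode = folio->mapping->host;
	loff_t end = folio_pos(folio) + folio_size(folio);

	example_punch_delalloc(inode, pos, end - pos);
}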
1760 :
1761 : /*
1762 : * Write out a dirty page.
1763 : *
1764 : * For delalloc space on the page, we need to allocate space and flush it.
1765 : * For unwritten space on the page, we need to start the conversion to
1766 : * regular allocated space.
1767 : */
1768 57246576 : static int iomap_do_writepage(struct folio *folio,
1769 : struct writeback_control *wbc, void *data)
1770 : {
1771 57246576 : struct iomap_writepage_ctx *wpc = data;
1772 57246576 : struct inode *inode = folio->mapping->host;
1773 57246576 : u64 end_pos, isize;
1774 :
1775 58476942 : trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
1776 :
1777 : /*
1778 : * Refuse to write the folio out if we're called from reclaim context.
1779 : *
1780 : * This avoids stack overflows when called on already-deep stacks by
1781 : * random callers doing direct or memcg reclaim. We explicitly
1782 : * allow reclaim from kswapd as the stack usage there is relatively low.
1783 : *
1784 : * This should never happen except in the case of a VM regression so
1785 : * warn about it.
1786 : */
1787 57246596 : if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1788 : PF_MEMALLOC))
1789 0 : goto redirty;
1790 :
1791 : /*
1792 : * Is this folio beyond the end of the file?
1793 : *
1794 : * end_pos initially covers the whole folio; if that extends past EOF
1795 : * it is clamped below to the highest offset this folio should represent.
1796 : * -----------------------------------------------------
1797 : * | file mapping | <EOF> |
1798 : * -----------------------------------------------------
1799 : * | Page ... | Page N-2 | Page N-1 | Page N | |
1800 : * ^--------------------------------^----------|--------
1801 : * | desired writeback range | see else |
1802 : * ---------------------------------^------------------|
1803 : */
1804 57246596 : isize = i_size_read(inode);
1805 57246596 : end_pos = folio_pos(folio) + folio_size(folio);
1806 57246596 : if (end_pos > isize) {
1807 : /*
1808 : * Check whether the page to write out lies beyond i_size or
1809 : * straddles it.
1810 : * -------------------------------------------------------
1811 : * | file mapping | <EOF> |
1812 : * -------------------------------------------------------
1813 : * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
1814 : * ^--------------------------------^-----------|---------
1815 : * | | Straddles |
1816 : * ---------------------------------^-----------|--------|
1817 : */
1818 11590042 : size_t poff = offset_in_folio(folio, isize);
1819 11590042 : pgoff_t end_index = isize >> PAGE_SHIFT;
1820 :
1821 : /*
1822 : * Skip the page if it's fully outside i_size, e.g.
1823 : * due to a truncate operation that's in progress. We've
1824 : * cleaned this page and truncate will finish things off for
1825 : * us.
1826 : *
1827 : * Note that end_index is an unsigned long. If the given offset
1828 : * is greater than 16TB on a 32-bit system, then checking whether
1829 : * the page is fully outside i_size with
1830 : * "if (page->index >= end_index + 1)" would let "end_index + 1"
1831 : * overflow and evaluate to 0. This page would then be redirtied
1832 : * and written out repeatedly, which would result in an infinite
1833 : * loop; the user program performing this operation would hang.
1834 : * Instead, we can detect this situation by checking whether the
1835 : * page is totally beyond i_size or whether its offset is just
1836 : * equal to the EOF.
1837 : */
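/*
 * Editor's note - worked example, for illustration only (not part of the
 * original source): on 32-bit with 4KiB pages, an isize just below 16TiB
 * (2^44 bytes) gives end_index = (2^44 - 1) >> 12 = 2^32 - 1 = ULONG_MAX,
 * so "end_index + 1" wraps to 0 and a ">= end_index + 1" test would be
 * true for every page.
 */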
1838 11590042 : if (folio->index > end_index ||
1839 11553240 : (folio->index == end_index && poff == 0))
1840 147 : goto unlock;
1841 :
1842 : /*
1843 : * The page straddles i_size. It must be zeroed out on each
1844 : * and every writepage invocation because it may be mmapped.
1845 : * "A file is mapped in multiples of the page size. For a file
1846 : * that is not a multiple of the page size, the remaining
1847 : * memory is zeroed when mapped, and writes to that region are
1848 : * not written out to the file."
1849 : */
1850 11589895 : folio_zero_segment(folio, poff, folio_size(folio));
1851 11589895 : end_pos = isize;
1852 : }
1853 :
1854 57246459 : return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
1855 :
1856 : redirty:
1857 0 : folio_redirty_for_writepage(wbc, folio);
1858 147 : unlock:
1859 147 : folio_unlock(folio);
1860 147 : return 0;
1861 : }
1862 :
1863 : int
1864 44377409 : iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1865 : struct iomap_writepage_ctx *wpc,
1866 : const struct iomap_writeback_ops *ops)
1867 : {
1868 44377409 : int ret;
1869 :
1870 44377409 : wpc->ops = ops;
1871 44377409 : ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1872 44377833 : if (!wpc->ioend)
1873 : return ret;
1874 27680428 : return iomap_submit_ioend(wpc, wpc->ioend, ret);
1875 : }
1876 : EXPORT_SYMBOL_GPL(iomap_writepages);
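/*
 * Editor's sketch (not part of the original source): a filesystem drives
 * this writeback path from its ->writepages method by passing a
 * struct iomap_writepage_ctx and iomap_writeback_ops whose ->map_blocks
 * fills wpc->iomap for each block.  All example_* names are hypothetical.
 */
static int example_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset)
{
	/* look up (or allocate) the extent covering @offset into wpc->iomap */
	return 0;
}

static const struct iomap_writeback_ops example_writeback_ops = {
	.map_blocks	= example_map_blocks,
};

static int example_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &example_writeback_ops);
}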
1877 :
1878 0 : static int __init iomap_init(void)
1879 : {
1880 0 : return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1881 : offsetof(struct iomap_ioend, io_inline_bio),
1882 : BIOSET_NEED_BVECS);
1883 : }
1884 : fs_initcall(iomap_init);
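/*
 * Editor's note - for illustration only (not part of the original source):
 * with 4KiB pages and 512-byte sectors the bioset above reserves
 * 4 * (4096 / 512) = 32 bios, a small mempool-backed reserve intended to
 * keep writeback making progress under memory pressure.
 */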
|