Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2010 Red Hat, Inc.
4 : * Copyright (C) 2016-2019 Christoph Hellwig.
5 : */
6 : #include <linux/module.h>
7 : #include <linux/compiler.h>
8 : #include <linux/fs.h>
9 : #include <linux/iomap.h>
10 : #include <linux/pagemap.h>
11 : #include <linux/uio.h>
12 : #include <linux/buffer_head.h>
13 : #include <linux/dax.h>
14 : #include <linux/writeback.h>
15 : #include <linux/list_sort.h>
16 : #include <linux/swap.h>
17 : #include <linux/bio.h>
18 : #include <linux/sched/signal.h>
19 : #include <linux/migrate.h>
20 : #include "trace.h"
21 :
22 : #include "../internal.h"
23 :
24 : #define IOEND_BATCH_SIZE 4096
25 :
26 : /*
27 : * Structure allocated for each folio when block size < folio size
28 : * to track sub-folio uptodate status and I/O completions.
29 : */
30 : struct iomap_page {
31 : atomic_t read_bytes_pending;
32 : atomic_t write_bytes_pending;
33 : spinlock_t uptodate_lock;
34 : unsigned long uptodate[];
35 : };
36 :
37 : static inline struct iomap_page *to_iomap_page(struct folio *folio)
38 : {
39 764921519 : if (folio_test_private(folio))
40 579914029 : return folio_get_private(folio);
41 : return NULL;
42 : }
43 :
44 : static struct bio_set iomap_ioend_bioset;
45 :
46 : static struct iomap_page *
47 294337552 : iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
48 : {
49 294337552 : struct iomap_page *iop = to_iomap_page(folio);
50 294337552 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
51 294337552 : gfp_t gfp;
52 :
53 294337552 : if (iop || nr_blocks <= 1)
54 : return iop;
55 :
56 184774913 : if (flags & IOMAP_NOWAIT)
57 : gfp = GFP_NOWAIT;
58 : else
59 184774691 : gfp = GFP_NOFS | __GFP_NOFAIL;
60 :
61 184774913 : iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
62 : gfp);
63 184770619 : if (iop) {
64 184770619 : spin_lock_init(&iop->uptodate_lock);
65 184969667 : if (folio_test_uptodate(folio))
66 198977 : bitmap_fill(iop->uptodate, nr_blocks);
67 184770690 : folio_attach_private(folio, iop);
68 : }
69 : return iop;
70 : }
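/*
 * Worked example (illustration only, not part of this file): with 4kB
 * blocks in a 64kB folio, i_blocks_per_folio() returns 16, so the
 * allocation above is struct_size(iop, uptodate, BITS_TO_LONGS(16)),
 * i.e. sizeof(struct iomap_page) plus one unsigned long of uptodate
 * bitmap.
 */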
71 :
72 184772824 : static void iomap_page_release(struct folio *folio)
73 : {
74 184772824 : struct iomap_page *iop = folio_detach_private(folio);
75 184782736 : struct inode *inode = folio->mapping->host;
76 184782736 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
77 :
78 184782736 : if (!iop)
79 : return;
80 184782736 : WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
81 184782736 : WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
82 369567573 : WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
83 : folio_test_uptodate(folio));
84 184784837 : kfree(iop);
85 : }
86 :
87 : /*
88 : * Calculate the range inside the folio that we actually need to read.
89 : */
90 249353811 : static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
91 : loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
92 : {
93 249353811 : struct iomap_page *iop = to_iomap_page(folio);
94 249353811 : loff_t orig_pos = *pos;
95 249353811 : loff_t isize = i_size_read(inode);
96 249353811 : unsigned block_bits = inode->i_blkbits;
97 249353811 : unsigned block_size = (1 << block_bits);
98 249353811 : size_t poff = offset_in_folio(folio, *pos);
99 249353811 : size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
100 249353811 : unsigned first = poff >> block_bits;
101 249353811 : unsigned last = (poff + plen - 1) >> block_bits;
102 :
103 : /*
104 : * If the block size is smaller than the folio size, we need to check the
105 : * per-block uptodate status and adjust the offset and length if needed
106 : * to avoid reading in already uptodate ranges.
107 : */
108 249353811 : if (iop) {
109 : unsigned int i;
110 :
111 : /* move forward for each leading block marked uptodate */
112 290405919 : for (i = first; i <= last; i++) {
113 258789535 : if (!test_bit(i, iop->uptodate))
114 : break;
115 41052748 : *pos += block_size;
116 41052748 : poff += block_size;
117 41052748 : plen -= block_size;
118 41052748 : first++;
119 : }
120 :
121 : /* truncate len if we find any trailing uptodate block(s) */
122 3271782625 : for ( ; i <= last; i++) {
123 3023010644 : if (test_bit(i, iop->uptodate)) {
124 581190 : plen -= (last - i + 1) * block_size;
125 581190 : last = i - 1;
126 581190 : break;
127 : }
128 : }
129 : }
130 :
131 : /*
132 : * If the extent spans the block that contains the i_size, we need to
133 : * handle both halves separately so that we properly zero data in the
134 : * page cache for blocks that are entirely outside of i_size.
135 : */
136 249353811 : if (orig_pos <= isize && orig_pos + length > isize) {
137 58485443 : unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
138 :
139 58485443 : if (first <= end && last > end)
140 1570992 : plen -= (last - end) * block_size;
141 : }
142 :
143 249353811 : *offp = poff;
144 249353811 : *lenp = plen;
145 249353811 : }
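/*
 * Worked example (illustration only): for a 16kB folio with 4kB blocks
 * where only block 0 is already uptodate, a full-folio read enters with
 * poff == 0 and plen == 16384; the leading-block loop advances *pos and
 * poff by 4096 and trims plen to 12288, so only blocks 1-3 are read.
 */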
146 :
147 354877615 : static void iomap_iop_set_range_uptodate(struct folio *folio,
148 : struct iomap_page *iop, size_t off, size_t len)
149 : {
150 354877615 : struct inode *inode = folio->mapping->host;
151 354877615 : unsigned first = off >> inode->i_blkbits;
152 354877615 : unsigned last = (off + len - 1) >> inode->i_blkbits;
153 354877615 : unsigned long flags;
154 :
155 354877615 : spin_lock_irqsave(&iop->uptodate_lock, flags);
156 354968340 : bitmap_set(iop->uptodate, first, last - first + 1);
157 368975509 : if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
158 233179018 : folio_mark_uptodate(folio);
159 354964386 : spin_unlock_irqrestore(&iop->uptodate_lock, flags);
160 354947313 : }
161 :
162 355109426 : static void iomap_set_range_uptodate(struct folio *folio,
163 : struct iomap_page *iop, size_t off, size_t len)
164 : {
165 355109426 : if (iop)
166 354919949 : iomap_iop_set_range_uptodate(folio, iop, off, len);
167 : else
168 189477 : folio_mark_uptodate(folio);
169 355117110 : }
170 :
171 10772779 : static void iomap_finish_folio_read(struct folio *folio, size_t offset,
172 : size_t len, int error)
173 : {
174 10772779 : struct iomap_page *iop = to_iomap_page(folio);
175 :
176 10772779 : if (unlikely(error)) {
177 1347 : folio_clear_uptodate(folio);
178 1347 : folio_set_error(folio);
179 : } else {
180 10771432 : iomap_set_range_uptodate(folio, iop, offset, len);
181 : }
182 :
183 21545554 : if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
184 8277901 : folio_unlock(folio);
185 10772779 : }
186 :
187 9732742 : static void iomap_read_end_io(struct bio *bio)
188 : {
189 9732742 : int error = blk_status_to_errno(bio->bi_status);
190 9732742 : struct folio_iter fi;
191 :
192 20505521 : bio_for_each_folio_all(fi, bio)
193 10772779 : iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
194 9732742 : bio_put(bio);
195 9732742 : }
196 :
197 : struct iomap_readpage_ctx {
198 : struct folio *cur_folio;
199 : bool cur_folio_in_bio;
200 : struct bio *bio;
201 : struct readahead_control *rac;
202 : };
203 :
204 : /**
205 : * iomap_read_inline_data - copy inline data into the page cache
206 : * @iter: iteration structure
207 : * @folio: folio to copy to
208 : *
209 : * Copy the inline data in @iter into @folio and zero out the rest of the folio.
210 : * Only a single IOMAP_INLINE extent is allowed at the end of each file.
211 : * Returns zero for success to complete the read, or the usual negative errno.
212 : */
213 0 : static int iomap_read_inline_data(const struct iomap_iter *iter,
214 : struct folio *folio)
215 : {
216 0 : struct iomap_page *iop;
217 0 : const struct iomap *iomap = iomap_iter_srcmap(iter);
218 0 : size_t size = i_size_read(iter->inode) - iomap->offset;
219 0 : size_t poff = offset_in_page(iomap->offset);
220 0 : size_t offset = offset_in_folio(folio, iomap->offset);
221 0 : void *addr;
222 :
223 0 : if (folio_test_uptodate(folio))
224 : return 0;
225 :
226 0 : if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
227 : return -EIO;
228 0 : if (WARN_ON_ONCE(size > PAGE_SIZE -
229 : offset_in_page(iomap->inline_data)))
230 : return -EIO;
231 0 : if (WARN_ON_ONCE(size > iomap->length))
232 : return -EIO;
233 0 : if (offset > 0)
234 0 : iop = iomap_page_create(iter->inode, folio, iter->flags);
235 : else
236 0 : iop = to_iomap_page(folio);
237 :
238 0 : addr = kmap_local_folio(folio, offset);
239 0 : memcpy(addr, iomap->inline_data, size);
240 0 : memset(addr + size, 0, PAGE_SIZE - poff - size);
241 0 : kunmap_local(addr);
242 0 : iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
243 0 : return 0;
244 : }
245 :
246 197819628 : static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
247 : loff_t pos)
248 : {
249 197819628 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
250 :
251 213208109 : return srcmap->type != IOMAP_MAPPED ||
252 197819628 : (srcmap->flags & IOMAP_F_NEW) ||
253 15388480 : pos >= i_size_read(iter->inode);
254 : }
255 :
256 166962258 : static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
257 : struct iomap_readpage_ctx *ctx, loff_t offset)
258 : {
259 166962258 : const struct iomap *iomap = &iter->iomap;
260 166962258 : loff_t pos = iter->pos + offset;
261 166962258 : loff_t length = iomap_length(iter) - offset;
262 166962258 : struct folio *folio = ctx->cur_folio;
263 166962258 : struct iomap_page *iop;
264 166962258 : loff_t orig_pos = pos;
265 166962258 : size_t poff, plen;
266 166962258 : sector_t sector;
267 :
268 166962258 : if (iomap->type == IOMAP_INLINE)
269 0 : return iomap_read_inline_data(iter, folio);
270 :
271 : /* zero post-eof blocks as the page may be mapped */
272 166962258 : iop = iomap_page_create(iter->inode, folio, iter->flags);
273 166961587 : iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
274 166964772 : if (plen == 0)
275 3831692 : goto done;
276 :
277 163133080 : if (iomap_block_needs_zeroing(iter, pos)) {
278 152248091 : folio_zero_range(folio, poff, plen);
279 152244944 : iomap_set_range_uptodate(folio, iop, poff, plen);
280 152246553 : goto done;
281 : }
282 :
283 10884989 : ctx->cur_folio_in_bio = true;
284 10884989 : if (iop)
285 10884985 : atomic_add(plen, &iop->read_bytes_pending);
286 :
287 10884948 : sector = iomap_sector(iomap, pos);
288 10884948 : if (!ctx->bio ||
289 5352269 : bio_end_sector(ctx->bio) != sector ||
290 1152710 : !bio_add_folio(ctx->bio, folio, plen, poff)) {
291 9732672 : gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
292 9732672 : gfp_t orig_gfp = gfp;
293 9732672 : unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
294 :
295 9732672 : if (ctx->bio)
296 3047285 : submit_bio(ctx->bio);
297 :
298 9732623 : if (ctx->rac) /* same as readahead_gfp_mask */
299 4612314 : gfp |= __GFP_NORETRY | __GFP_NOWARN;
300 9732623 : ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
301 : REQ_OP_READ, gfp);
302 : /*
303 : * If the bio_alloc fails, try it again for a single page to
304 : * avoid having to deal with partial page reads. This emulates
305 : * what do_mpage_read_folio does.
306 : */
307 9732687 : if (!ctx->bio) {
308 0 : ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
309 : orig_gfp);
310 : }
311 9732687 : if (ctx->rac)
312 4612350 : ctx->bio->bi_opf |= REQ_RAHEAD;
313 9732687 : ctx->bio->bi_iter.bi_sector = sector;
314 9732687 : ctx->bio->bi_end_io = iomap_read_end_io;
315 9732687 : bio_add_folio_nofail(ctx->bio, folio, plen, poff);
316 : }
317 :
318 1152274 : done:
319 : /*
320 : * Move the caller beyond our range so that it keeps making progress.
321 : * For that, we have to include any leading non-uptodate ranges, but
322 : * we can skip trailing ones as they will be handled in the next
323 : * iteration.
324 : */
325 166963164 : return pos - orig_pos + plen;
326 : }
327 :
328 113395255 : int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
329 : {
330 226790510 : struct iomap_iter iter = {
331 113395255 : .inode = folio->mapping->host,
332 : .pos = folio_pos(folio),
333 : .len = folio_size(folio),
334 : };
335 113395255 : struct iomap_readpage_ctx ctx = {
336 : .cur_folio = folio,
337 : };
338 113395255 : int ret;
339 :
340 113395255 : trace_iomap_readpage(iter.inode, 1);
341 :
342 236599162 : while ((ret = iomap_iter(&iter, ops)) > 0)
343 123204528 : iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
344 :
345 113397659 : if (ret < 0)
346 65 : folio_set_error(folio);
347 :
348 113397659 : if (ctx.bio) {
349 4632478 : submit_bio(ctx.bio);
350 4632503 : WARN_ON_ONCE(!ctx.cur_folio_in_bio);
351 : } else {
352 108765181 : WARN_ON_ONCE(ctx.cur_folio_in_bio);
353 108765181 : folio_unlock(folio);
354 : }
355 :
356 : /*
357 : * Just like mpage_readahead and block_read_full_folio, we always
358 : * return 0 and just set the folio error flag on errors. This
359 : * should be cleaned up throughout the stack eventually.
360 : */
361 113395771 : return 0;
362 : }
363 : EXPORT_SYMBOL_GPL(iomap_read_folio);
364 :
365 18823756 : static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
366 : struct iomap_readpage_ctx *ctx)
367 : {
368 18823756 : loff_t length = iomap_length(iter);
369 18823756 : loff_t done, ret;
370 :
371 62586979 : for (done = 0; done < length; done += ret) {
372 43763130 : if (ctx->cur_folio &&
373 45943023 : offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
374 24119783 : if (!ctx->cur_folio_in_bio)
375 22017054 : folio_unlock(ctx->cur_folio);
376 24119428 : ctx->cur_folio = NULL;
377 : }
378 43762775 : if (!ctx->cur_folio) {
379 34200249 : ctx->cur_folio = readahead_folio(ctx->rac);
380 34200743 : ctx->cur_folio_in_bio = false;
381 : }
382 43763269 : ret = iomap_readpage_iter(iter, ctx, done);
383 43763223 : if (ret <= 0)
384 0 : return ret;
385 : }
386 :
387 : return done;
388 : }
389 :
390 : /**
391 : * iomap_readahead - Attempt to read pages from a file.
392 : * @rac: Describes the pages to be read.
393 : * @ops: The operations vector for the filesystem.
394 : *
395 : * This function is for filesystems to call to implement their readahead
396 : * address_space operation.
397 : *
398 : * Context: The @ops callbacks may submit I/O (eg to read the addresses of
399 : * blocks from disc), and may wait for it. The caller may be trying to
400 : * access a different page, and so sleeping excessively should be avoided.
401 : * It may allocate memory, but should avoid costly allocations. This
402 : * function is called with memalloc_nofs set, so allocations will not cause
403 : * the filesystem to be reentered.
404 : */
405 10081120 : void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
406 : {
407 10081120 : struct iomap_iter iter = {
408 10081120 : .inode = rac->mapping->host,
409 : .pos = readahead_pos(rac),
410 : .len = readahead_length(rac),
411 : };
412 10081120 : struct iomap_readpage_ctx ctx = {
413 : .rac = rac,
414 : };
415 :
416 10081120 : trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
417 :
418 28904962 : while (iomap_iter(&iter, ops) > 0)
419 18823766 : iter.processed = iomap_readahead_iter(&iter, &ctx);
420 :
421 10081138 : if (ctx.bio)
422 2052917 : submit_bio(ctx.bio);
423 10081130 : if (ctx.cur_folio) {
424 10081110 : if (!ctx.cur_folio_in_bio)
425 8538503 : folio_unlock(ctx.cur_folio);
426 : }
427 10081085 : }
428 : EXPORT_SYMBOL_GPL(iomap_readahead);
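/*
 * Illustrative glue (not from this file): filesystems typically wire
 * iomap_read_folio() and iomap_readahead() into their address_space
 * operations as below.  The "myfs_" names and myfs_read_iomap_ops (a
 * filesystem-provided struct iomap_ops) are hypothetical.
 */
static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &myfs_read_iomap_ops);
}

static void myfs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &myfs_read_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	.read_folio	= myfs_read_folio,
	.readahead	= myfs_readahead,
	/* ... */
};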
429 :
430 : /*
431 : * iomap_is_partially_uptodate checks whether blocks within a folio are
432 : * uptodate or not.
433 : *
434 : * Returns true if all blocks which correspond to the specified part
435 : * of the folio are uptodate.
436 : */
437 3563984 : bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
438 : {
439 3563984 : struct iomap_page *iop = to_iomap_page(folio);
440 3563984 : struct inode *inode = folio->mapping->host;
441 3563984 : unsigned first, last, i;
442 :
443 3563984 : if (!iop)
444 : return false;
445 :
446 : /* Caller's range may extend past the end of this folio */
447 3520025 : count = min(folio_size(folio) - from, count);
448 :
449 : /* First and last blocks in range within folio */
450 3520025 : first = from >> inode->i_blkbits;
451 3520025 : last = (from + count - 1) >> inode->i_blkbits;
452 :
453 7821246 : for (i = first; i <= last; i++)
454 6905464 : if (!test_bit(i, iop->uptodate))
455 : return false;
456 : return true;
457 : }
458 : EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
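/*
 * Worked example (illustration only): with 1kB blocks, a query for
 * from == 512 and count == 2048 checks blocks 0 through 2
 * (first == 0, last == (512 + 2048 - 1) >> 10 == 2); true is returned
 * only if all three bits are set in iop->uptodate.
 */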
459 :
460 : /**
461 : * iomap_get_folio - get a folio reference for writing
462 : * @iter: iteration structure
463 : * @pos: start offset of write
464 : *
465 : * Returns a locked reference to the folio at @pos, or an error pointer if the
466 : * folio could not be obtained.
467 : */
468 157447260 : struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
469 : {
470 157447260 : unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
471 :
472 157447260 : if (iter->flags & IOMAP_NOWAIT)
473 0 : fgp |= FGP_NOWAIT;
474 :
475 157447260 : return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
476 : fgp, mapping_gfp_mask(iter->inode->i_mapping));
477 : }
478 : EXPORT_SYMBOL_GPL(iomap_get_folio);
479 :
480 48031770 : bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
481 : {
482 52327616 : trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
483 : folio_size(folio));
484 :
485 : /*
486 : * mm accommodates an old ext3 case where clean folios might
487 : * not have had the dirty bit cleared. Thus, it can send actual
488 : * dirty folios to ->release_folio() via shrink_active_list();
489 : * skip those here.
490 : */
491 48031349 : if (folio_test_dirty(folio) || folio_test_writeback(folio))
492 : return false;
493 47878630 : iomap_page_release(folio);
494 47878630 : return true;
495 : }
496 : EXPORT_SYMBOL_GPL(iomap_release_folio);
497 :
498 138151648 : void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
499 : {
500 138151648 : trace_iomap_invalidate_folio(folio->mapping->host,
501 138151648 : folio_pos(folio) + offset, len);
502 :
503 : /*
504 : * If we're invalidating the entire folio, clear the dirty state
505 : * from it and release it to avoid unnecessary buildup of the LRU.
506 : */
507 274117888 : if (offset == 0 && len == folio_size(folio)) {
508 135646372 : WARN_ON_ONCE(folio_test_writeback(folio));
509 135646372 : folio_cancel_dirty(folio);
510 135648641 : iomap_page_release(folio);
511 2504880 : } else if (folio_test_large(folio)) {
512 : /* Must release the iop so the page can be split */
513 2504654 : WARN_ON_ONCE(!folio_test_uptodate(folio) &&
514 : folio_test_dirty(folio));
515 1252327 : iomap_page_release(folio);
516 : }
517 138155581 : }
518 : EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
519 :
520 : static void
521 11371 : iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
522 : {
523 11371 : loff_t i_size = i_size_read(inode);
524 :
525 : /*
526 : * Only truncate newly allocated pages beyond EOF, even if the
527 : * write started inside the existing inode size.
528 : */
529 11371 : if (pos + len > i_size)
530 3236 : truncate_pagecache_range(inode, max(pos, i_size),
531 : pos + len - 1);
532 11371 : }
533 :
534 4491250 : static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
535 : size_t poff, size_t plen, const struct iomap *iomap)
536 : {
537 4491250 : struct bio_vec bvec;
538 4491250 : struct bio bio;
539 :
540 4491250 : bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
541 4491253 : bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
542 4491253 : bio_add_folio_nofail(&bio, folio, plen, poff);
543 4491247 : return submit_bio_wait(&bio);
544 : }
545 :
546 157455451 : static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
547 : size_t len, struct folio *folio)
548 : {
549 157455451 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
550 157455451 : struct iomap_page *iop;
551 157455451 : loff_t block_size = i_blocksize(iter->inode);
552 157455451 : loff_t block_start = round_down(pos, block_size);
553 157455451 : loff_t block_end = round_up(pos + len, block_size);
554 157455451 : unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
555 157455451 : size_t from = offset_in_folio(folio, pos), to = from + len;
556 157455451 : size_t poff, plen;
557 :
558 232564037 : if (folio_test_uptodate(folio))
559 : return 0;
560 82331835 : folio_clear_error(folio);
561 :
562 82324850 : iop = iomap_page_create(iter->inode, folio, iter->flags);
563 82334409 : if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
564 : return -EAGAIN;
565 :
566 82388421 : do {
567 82388421 : iomap_adjust_read_range(iter->inode, folio, &block_start,
568 : block_end - block_start, &poff, &plen);
569 82384077 : if (plen == 0)
570 : break;
571 :
572 54608632 : if (!(iter->flags & IOMAP_UNSHARE) &&
573 54564438 : (from <= poff || from >= poff + plen) &&
574 36243211 : (to <= poff || to >= poff + plen))
575 19919364 : continue;
576 :
577 34689268 : if (iomap_block_needs_zeroing(iter, block_start)) {
578 30198007 : if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
579 : return -EIO;
580 30198007 : folio_zero_segments(folio, poff, from, to, poff + plen);
581 : } else {
582 4491261 : int status;
583 :
584 4491261 : if (iter->flags & IOMAP_NOWAIT)
585 : return -EAGAIN;
586 :
587 4491261 : status = iomap_read_folio_sync(block_start, folio,
588 : poff, plen, srcmap);
589 4490936 : if (status)
590 1139 : return status;
591 : }
592 34687987 : iomap_set_range_uptodate(folio, iop, poff, plen);
593 54609969 : } while ((block_start += plen) < block_end);
594 :
595 : return 0;
596 : }
597 :
598 157447148 : static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
599 : size_t len)
600 : {
601 157447148 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
602 :
603 157447148 : if (folio_ops && folio_ops->get_folio)
604 0 : return folio_ops->get_folio(iter, pos, len);
605 : else
606 157447148 : return iomap_get_folio(iter, pos);
607 : }
608 :
609 157466555 : static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
610 : struct folio *folio)
611 : {
612 157466555 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
613 :
614 157466555 : if (folio_ops && folio_ops->put_folio) {
615 0 : folio_ops->put_folio(iter->inode, pos, ret, folio);
616 : } else {
617 157466555 : folio_unlock(folio);
618 157454627 : folio_put(folio);
619 : }
620 157465338 : }
621 :
622 0 : static int iomap_write_begin_inline(const struct iomap_iter *iter,
623 : struct folio *folio)
624 : {
625 : /* needs more work for the tailpacking case; disable for now */
626 0 : if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
627 : return -EIO;
628 0 : return iomap_read_inline_data(iter, folio);
629 : }
630 :
631 157427723 : static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
632 : size_t len, struct folio **foliop)
633 : {
634 157427723 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
635 157427723 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
636 157427723 : struct folio *folio;
637 157427723 : int status = 0;
638 :
639 157427723 : BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
640 157427723 : if (srcmap != &iter->iomap)
641 909381 : BUG_ON(pos + len > srcmap->offset + srcmap->length);
642 :
643 157427723 : if (fatal_signal_pending(current))
644 : return -EINTR;
645 :
646 314889332 : if (!mapping_large_folio_support(iter->inode->i_mapping))
647 0 : len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
648 :
649 157444666 : folio = __iomap_get_folio(iter, pos, len);
650 157459934 : if (IS_ERR(folio))
651 0 : return PTR_ERR(folio);
652 :
653 : /*
654 : * Now we have a locked folio, before we do anything with it we need to
655 : * check that the iomap we have cached is not stale. The inode extent
656 : * mapping can change due to concurrent IO in flight (e.g.
657 : * IOMAP_UNWRITTEN state can change and memory reclaim could have
658 : * reclaimed a previously partially written page at this index after IO
659 : * completion before this write reaches this file offset) and hence we
660 : * could do the wrong thing here (zero a page range incorrectly or fail
661 : * to zero) and corrupt data.
662 : */
663 157459934 : if (folio_ops && folio_ops->iomap_valid) {
664 157462665 : bool iomap_valid = folio_ops->iomap_valid(iter->inode,
665 : &iter->iomap);
666 157459799 : if (!iomap_valid) {
667 10230 : iter->iomap.flags |= IOMAP_F_STALE;
668 10230 : status = 0;
669 10230 : goto out_unlock;
670 : }
671 : }
672 :
673 158298334 : if (pos + len > folio_pos(folio) + folio_size(folio))
674 17810 : len = folio_pos(folio) + folio_size(folio) - pos;
675 :
676 157446838 : if (srcmap->type == IOMAP_INLINE)
677 0 : status = iomap_write_begin_inline(iter, folio);
678 157446838 : else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
679 0 : status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
680 : else
681 157446838 : status = __iomap_write_begin(iter, pos, len, folio);
682 :
683 157431286 : if (unlikely(status))
684 1139 : goto out_unlock;
685 :
686 157430147 : *foliop = folio;
687 157430147 : return 0;
688 :
689 11369 : out_unlock:
690 11369 : __iomap_put_folio(iter, pos, 0, folio);
691 11369 : iomap_write_failed(iter->inode, pos, len);
692 :
693 11369 : return status;
694 : }
695 :
696 157422971 : static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
697 : size_t copied, struct folio *folio)
698 : {
699 157422971 : struct iomap_page *iop = to_iomap_page(folio);
700 157422971 : flush_dcache_folio(folio);
701 :
702 : /*
703 : * The blocks that were entirely written will now be uptodate, so we
704 : * don't have to worry about a read_folio reading them and overwriting a
705 : * partial write. However, if we've encountered a short write and only
706 : * partially written into a block, it will not be marked uptodate, so a
707 : * read_folio might come in and destroy our partial write.
708 : *
709 : * Do the simplest thing and just treat any short write to a
710 : * non-uptodate page as a zero-length write, and force the caller to
711 : * redo the whole thing.
712 : */
713 157438064 : if (unlikely(copied < len && !folio_test_uptodate(folio)))
714 : return 0;
715 158289556 : iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
716 157404918 : filemap_dirty_folio(inode->i_mapping, folio);
717 157404918 : return copied;
718 : }
719 :
720 0 : static size_t iomap_write_end_inline(const struct iomap_iter *iter,
721 : struct folio *folio, loff_t pos, size_t copied)
722 : {
723 0 : const struct iomap *iomap = &iter->iomap;
724 0 : void *addr;
725 :
726 0 : WARN_ON_ONCE(!folio_test_uptodate(folio));
727 0 : BUG_ON(!iomap_inline_data_valid(iomap));
728 :
729 0 : flush_dcache_folio(folio);
730 0 : addr = kmap_local_folio(folio, pos);
731 0 : memcpy(iomap_inline_data(iomap, pos), addr, copied);
732 0 : kunmap_local(addr);
733 :
734 0 : mark_inode_dirty(iter->inode);
735 0 : return copied;
736 : }
737 :
738 : /* Returns the number of bytes copied. May be 0. Cannot be an errno. */
739 157440818 : static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
740 : size_t copied, struct folio *folio)
741 : {
742 157440818 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
743 157440818 : loff_t old_size = iter->inode->i_size;
744 157440818 : size_t ret;
745 :
746 157440818 : if (srcmap->type == IOMAP_INLINE) {
747 0 : ret = iomap_write_end_inline(iter, folio, pos, copied);
748 157440818 : } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
749 0 : ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
750 : copied, &folio->page, NULL);
751 : } else {
752 157440818 : ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
753 : }
754 :
755 : /*
756 : * Update the in-memory inode size after copying the data into the page
757 : * cache. It's up to the file system to write the updated size to disk,
758 : * preferably after I/O completion so that no stale data is exposed.
759 : */
760 157455269 : if (pos + ret > old_size) {
761 61815951 : i_size_write(iter->inode, pos + ret);
762 61815951 : iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
763 : }
764 157455269 : __iomap_put_folio(iter, pos, ret, folio);
765 :
766 157452981 : if (old_size < pos)
767 6106760 : pagecache_isize_extended(iter->inode, old_size, pos);
768 157452972 : if (ret < len)
769 2 : iomap_write_failed(iter->inode, pos + ret, len - ret);
770 157452972 : return ret;
771 : }
772 :
773 55735613 : static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
774 : {
775 55735613 : loff_t length = iomap_length(iter);
776 55735613 : loff_t pos = iter->pos;
777 55735613 : ssize_t written = 0;
778 55735613 : long status = 0;
779 55735613 : struct address_space *mapping = iter->inode->i_mapping;
780 55735613 : unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
781 :
782 141316484 : do {
783 141316484 : struct folio *folio;
784 141316484 : struct page *page;
785 141316484 : unsigned long offset; /* Offset into pagecache page */
786 141316484 : unsigned long bytes; /* Bytes to write to page */
787 141316484 : size_t copied; /* Bytes copied from user */
788 :
789 141316484 : offset = offset_in_page(pos);
790 141316484 : bytes = min_t(unsigned long, PAGE_SIZE - offset,
791 : iov_iter_count(i));
792 141316486 : again:
793 141316486 : status = balance_dirty_pages_ratelimited_flags(mapping,
794 : bdp_flags);
795 141308734 : if (unlikely(status))
796 : break;
797 :
798 141308734 : if (bytes > length)
799 : bytes = length;
800 :
801 : /*
802 : * Bring in the user page that we'll copy from _first_.
803 : * Otherwise there's a nasty deadlock on copying from the
804 : * same page as we're writing to, without it being marked
805 : * up-to-date.
806 : *
807 : * For async buffered writes the assumption is that the user
808 : * page has already been faulted in. This can be optimized by
809 : * faulting the user page.
810 : */
811 141308734 : if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
812 : status = -EFAULT;
813 : break;
814 : }
815 :
816 141262361 : status = iomap_write_begin(iter, pos, bytes, &folio);
817 141284968 : if (unlikely(status))
818 : break;
819 141284612 : if (iter->iomap.flags & IOMAP_F_STALE)
820 : break;
821 :
822 141274607 : page = folio_file_page(folio, pos >> PAGE_SHIFT);
823 141297185 : if (mapping_writably_mapped(mapping))
824 58295 : flush_dcache_page(page);
825 :
826 141297185 : copied = copy_page_from_iter_atomic(page, offset, bytes, i);
827 :
828 141294623 : status = iomap_write_end(iter, pos, bytes, copied, folio);
829 :
830 141302170 : if (unlikely(copied != status))
831 0 : iov_iter_revert(i, copied - status);
832 :
833 141302170 : cond_resched();
834 141302521 : if (unlikely(status == 0)) {
835 : /*
836 : * A short copy made iomap_write_end() reject the
837 : * thing entirely. Might be memory poisoning
838 : * halfway through, might be a race with munmap,
839 : * might be severe memory pressure.
840 : */
841 2 : if (copied)
842 0 : bytes = copied;
843 2 : goto again;
844 : }
845 141302519 : pos += status;
846 141302519 : written += status;
847 141302519 : length -= status;
848 141302519 : } while (iov_iter_count(i) && length);
849 :
850 55732009 : if (status == -EAGAIN) {
851 0 : iov_iter_revert(i, written);
852 0 : return -EAGAIN;
853 : }
854 55732009 : return written ? written : status;
855 : }
856 :
857 : ssize_t
858 53657415 : iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
859 : const struct iomap_ops *ops)
860 : {
861 53657415 : struct iomap_iter iter = {
862 53657415 : .inode = iocb->ki_filp->f_mapping->host,
863 53657415 : .pos = iocb->ki_pos,
864 : .len = iov_iter_count(i),
865 : .flags = IOMAP_WRITE,
866 : };
867 53657415 : ssize_t ret;
868 :
869 53657415 : if (iocb->ki_flags & IOCB_NOWAIT)
870 0 : iter.flags |= IOMAP_NOWAIT;
871 :
872 109389063 : while ((ret = iomap_iter(&iter, ops)) > 0)
873 55725436 : iter.processed = iomap_write_iter(&iter, i);
874 :
875 53664973 : if (unlikely(iter.pos == iocb->ki_pos))
876 : return ret;
877 53089040 : ret = iter.pos - iocb->ki_pos;
878 53089040 : iocb->ki_pos = iter.pos;
879 53089040 : return ret;
880 : }
881 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
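/*
 * Illustrative caller (not from this file): a filesystem's ->write_iter
 * typically takes the inode lock, runs the generic write checks and then
 * hands the iov_iter to iomap_file_buffered_write() with its own
 * iomap_ops.  The "myfs_" names are hypothetical.
 */
static ssize_t myfs_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
				&myfs_buffered_write_iomap_ops);
	inode_unlock(inode);
	return ret;
}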
882 :
883 : /*
884 : * Scan the data range passed to us for dirty page cache folios. If we find a
885 : * dirty folio, punch out the preceding range and update the offset from
886 : * which the next punch will start.
887 : *
888 : * We can punch out storage reservations under clean pages because they either
889 : * contain data that has been written back - in which case the delalloc punch
890 : * over that range is a no-op - or they were instantiated by read faults, in
891 : * which case they contain zeroes and we can remove the delalloc backing range;
892 : * any new writes to those pages will do the normal hole filling operation...
893 : *
894 : * This makes the logic simple: we only need to keep delalloc extents over
895 : * the dirty ranges of the page cache.
896 : *
897 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
898 : * simplify range iterations.
899 : */
900 95 : static int iomap_write_delalloc_scan(struct inode *inode,
901 : loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
902 : int (*punch)(struct inode *inode, loff_t offset, loff_t length))
903 : {
904 195 : while (start_byte < end_byte) {
905 100 : struct folio *folio;
906 :
907 : /* grab locked page */
908 100 : folio = filemap_lock_folio(inode->i_mapping,
909 100 : start_byte >> PAGE_SHIFT);
910 100 : if (IS_ERR(folio)) {
911 0 : start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
912 : PAGE_SIZE;
913 0 : continue;
914 : }
915 :
916 : /* if dirty, punch up to offset */
917 100 : if (folio_test_dirty(folio)) {
918 6 : if (start_byte > *punch_start_byte) {
919 0 : int error;
920 :
921 0 : error = punch(inode, *punch_start_byte,
922 : start_byte - *punch_start_byte);
923 0 : if (error) {
924 0 : folio_unlock(folio);
925 0 : folio_put(folio);
926 0 : return error;
927 : }
928 : }
929 :
930 : /*
931 : * Make sure the next punch start is correctly bound to
932 : * the end of this data range, not the end of the folio.
933 : */
934 8 : *punch_start_byte = min_t(loff_t, end_byte,
935 : folio_next_index(folio) << PAGE_SHIFT);
936 : }
937 :
938 : /* move offset to start of next folio in range */
939 100 : start_byte = folio_next_index(folio) << PAGE_SHIFT;
940 100 : folio_unlock(folio);
941 100 : folio_put(folio);
942 : }
943 : return 0;
944 : }
945 :
946 : /*
947 : * Punch out all the delalloc blocks in the range given except for those that
948 : * have dirty data still pending in the page cache - those are going to be
949 : * written and so must still retain the delalloc backing for writeback.
950 : *
951 : * As we are scanning the page cache for data, we don't need to reimplement the
952 : * wheel - mapping_seek_hole_data() does exactly what we need to identify the
953 : * start and end of data ranges correctly even for sub-folio block sizes. This
954 : * byte range based iteration is especially convenient because it means we
955 : * don't have to care about variable size folios, nor where the start or end of
956 : * the data range lies within a folio, if they lie within the same folio or even
957 : * if there are multiple discontiguous data ranges within the folio.
958 : *
959 : * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
960 : * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
961 : * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
962 : * date. A write page fault can then mark it dirty. If we then fail a write()
963 : * beyond EOF into that up to date cached range, we allocate a delalloc block
964 : * beyond EOF and then have to punch it out. Because the range is up to date,
965 : * mapping_seek_hole_data() will return it, and we will skip the punch because
966 : * the folio is dirty. This is incorrect - we always need to punch out delalloc
967 : * beyond EOF in this case as writeback will never write back and convert that
968 : * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
969 : * resulting in always punching out the range from the EOF to the end of the
970 : * range the iomap spans.
971 : *
972 : * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
973 : * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
974 : * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
975 : * returns the end of the data range (data_end). Using closed intervals would
976 : * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
977 : * the code to subtle off-by-one bugs....
978 : */
979 5742 : static int iomap_write_delalloc_release(struct inode *inode,
980 : loff_t start_byte, loff_t end_byte,
981 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
982 : {
983 5742 : loff_t punch_start_byte = start_byte;
984 5742 : loff_t scan_end_byte = min(i_size_read(inode), end_byte);
985 5742 : int error = 0;
986 :
987 : /*
988 : * Lock the mapping to avoid races with page faults re-instantiating
989 : * folios and dirtying them via ->page_mkwrite whilst we walk the
990 : * cache and perform delalloc extent removal. Failing to do this can
991 : * leave dirty pages with no space reservation in the cache.
992 : */
993 5742 : filemap_invalidate_lock(inode->i_mapping);
994 5837 : while (start_byte < scan_end_byte) {
995 4980 : loff_t data_end;
996 :
997 4980 : start_byte = mapping_seek_hole_data(inode->i_mapping,
998 : start_byte, scan_end_byte, SEEK_DATA);
999 : /*
1000 : * If there is no more data to scan, all that is left is to
1001 : * punch out the remaining range.
1002 : */
1003 4980 : if (start_byte == -ENXIO || start_byte == scan_end_byte)
1004 : break;
1005 95 : if (start_byte < 0) {
1006 0 : error = start_byte;
1007 0 : goto out_unlock;
1008 : }
1009 95 : WARN_ON_ONCE(start_byte < punch_start_byte);
1010 95 : WARN_ON_ONCE(start_byte > scan_end_byte);
1011 :
1012 : /*
1013 : * We find the end of this contiguous cached data range by
1014 : * seeking from start_byte to the beginning of the next hole.
1015 : */
1016 95 : data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
1017 : scan_end_byte, SEEK_HOLE);
1018 95 : if (data_end < 0) {
1019 0 : error = data_end;
1020 0 : goto out_unlock;
1021 : }
1022 95 : WARN_ON_ONCE(data_end <= start_byte);
1023 95 : WARN_ON_ONCE(data_end > scan_end_byte);
1024 :
1025 95 : error = iomap_write_delalloc_scan(inode, &punch_start_byte,
1026 : start_byte, data_end, punch);
1027 95 : if (error)
1028 0 : goto out_unlock;
1029 :
1030 : /* The next data search starts at the end of this one. */
1031 : start_byte = data_end;
1032 : }
1033 :
1034 5742 : if (punch_start_byte < end_byte)
1035 5736 : error = punch(inode, punch_start_byte,
1036 : end_byte - punch_start_byte);
1037 6 : out_unlock:
1038 5742 : filemap_invalidate_unlock(inode->i_mapping);
1039 5742 : return error;
1040 : }
1041 :
1042 : /*
1043 : * When a short write occurs, the filesystem may need to remove reserved space
1044 : * that was allocated in ->iomap_begin from its ->iomap_end method. For
1045 : * filesystems that use delayed allocation, we need to punch out delalloc
1046 : * extents from the range that are not dirty in the page cache. As the write can
1047 : * race with page faults, there can be dirty pages outside the range of a
1048 : * short write but still within the delalloc extent allocated for this
1049 : * iomap.
1050 : *
1051 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
1052 : * simplify range iterations.
1053 : *
1054 : * The punch() callback *must* only punch delalloc extents in the range passed
1055 : * to it. It must skip over all other types of extents in the range and leave
1056 : * them completely unchanged. It must do this punch atomically with respect to
1057 : * other extent modifications.
1058 : *
1059 : * The punch() callback may be called with a folio locked to prevent writeback
1060 : * extent allocation racing at the edge of the range we are currently punching.
1061 : * The locked folio may or may not cover the range being punched, so it is not
1062 : * safe for the punch() callback to lock folios itself.
1063 : *
1064 : * Lock order is:
1065 : *
1066 : * inode->i_rwsem (shared or exclusive)
1067 : * inode->i_mapping->invalidate_lock (exclusive)
1068 : * folio_lock()
1069 : * ->punch
1070 : * internal filesystem allocation lock
1071 : */
1072 97707383 : int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
1073 : struct iomap *iomap, loff_t pos, loff_t length,
1074 : ssize_t written,
1075 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
1076 : {
1077 97707383 : loff_t start_byte;
1078 97707383 : loff_t end_byte;
1079 97707383 : unsigned int blocksize = i_blocksize(inode);
1080 :
1081 97707383 : if (iomap->type != IOMAP_DELALLOC)
1082 : return 0;
1083 :
1084 : /* If we didn't reserve the blocks, we're not allowed to punch them. */
1085 54395135 : if (!(iomap->flags & IOMAP_F_NEW))
1086 : return 0;
1087 :
1088 : /*
1089 : * start_byte refers to the first unused block after a short write. If
1090 : * nothing was written, round offset down to point at the first block in
1091 : * the range.
1092 : */
1093 16545092 : if (unlikely(!written))
1094 3963 : start_byte = round_down(pos, blocksize);
1095 : else
1096 16541129 : start_byte = round_up(pos + written, blocksize);
1097 16545092 : end_byte = round_up(pos + length, blocksize);
1098 :
1099 : /* Nothing to do if we've written the entire delalloc extent */
1100 16545092 : if (start_byte >= end_byte)
1101 : return 0;
1102 :
1103 5742 : return iomap_write_delalloc_release(inode, start_byte, end_byte,
1104 : punch);
1105 : }
1106 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
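/*
 * Illustrative sketch (not from this file): a delalloc-based filesystem
 * would call the helper above from its ->iomap_end method after a short
 * write, passing a callback that removes only delalloc extents in the
 * given byte range.  The "myfs_" names are hypothetical.
 */
static int myfs_punch_delalloc(struct inode *inode, loff_t offset,
		loff_t length)
{
	/* remove delalloc extents covering [offset, offset + length) */
	return 0;
}

static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	return iomap_file_buffered_write_punch_delalloc(inode, iomap, pos,
			length, written, myfs_punch_delalloc);
}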
1107 :
1108 12686 : static loff_t iomap_unshare_iter(struct iomap_iter *iter)
1109 : {
1110 12686 : struct iomap *iomap = &iter->iomap;
1111 12686 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1112 12686 : loff_t pos = iter->pos;
1113 12686 : loff_t length = iomap_length(iter);
1114 12686 : long status = 0;
1115 12686 : loff_t written = 0;
1116 :
1117 : /* don't bother with blocks that are not shared to start with */
1118 12686 : if (!(iomap->flags & IOMAP_F_SHARED))
1119 : return length;
1120 : /* don't bother with holes or unwritten extents */
1121 4841 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1122 : return length;
1123 :
1124 6107 : do {
1125 6107 : unsigned long offset = offset_in_page(pos);
1126 6107 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1127 6107 : struct folio *folio;
1128 :
1129 6107 : status = iomap_write_begin(iter, pos, bytes, &folio);
1130 6107 : if (unlikely(status))
1131 0 : return status;
1132 6107 : if (iter->iomap.flags & IOMAP_F_STALE)
1133 : break;
1134 :
1135 6107 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1136 6107 : if (WARN_ON_ONCE(status == 0))
1137 : return -EIO;
1138 :
1139 6107 : cond_resched();
1140 :
1141 6107 : pos += status;
1142 6107 : written += status;
1143 6107 : length -= status;
1144 :
1145 6107 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1146 6107 : } while (length);
1147 :
1148 : return written;
1149 : }
1150 :
1151 : int
1152 40 : iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1153 : const struct iomap_ops *ops)
1154 : {
1155 40 : struct iomap_iter iter = {
1156 : .inode = inode,
1157 : .pos = pos,
1158 : .len = len,
1159 : .flags = IOMAP_WRITE | IOMAP_UNSHARE,
1160 : };
1161 40 : int ret;
1162 :
1163 12726 : while ((ret = iomap_iter(&iter, ops)) > 0)
1164 12686 : iter.processed = iomap_unshare_iter(&iter);
1165 40 : return ret;
1166 : }
1167 : EXPORT_SYMBOL_GPL(iomap_file_unshare);
1168 :
1169 41980911 : static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
1170 : {
1171 41980911 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1172 41980911 : loff_t pos = iter->pos;
1173 41980911 : loff_t length = iomap_length(iter);
1174 41980911 : loff_t written = 0;
1175 :
1176 : /* already zeroed? we're done. */
1177 41980911 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1178 : return length;
1179 :
1180 16154850 : do {
1181 16154850 : struct folio *folio;
1182 16154850 : int status;
1183 16154850 : size_t offset;
1184 16154850 : size_t bytes = min_t(u64, SIZE_MAX, length);
1185 :
1186 16154850 : status = iomap_write_begin(iter, pos, bytes, &folio);
1187 16155431 : if (status)
1188 897 : return status;
1189 16154534 : if (iter->iomap.flags & IOMAP_F_STALE)
1190 : break;
1191 :
1192 16154309 : offset = offset_in_folio(folio, pos);
1193 16202085 : if (bytes > folio_size(folio) - offset)
1194 17810 : bytes = folio_size(folio) - offset;
1195 :
1196 16154309 : folio_zero_range(folio, offset, bytes);
1197 16140412 : folio_mark_accessed(folio);
1198 :
1199 16154473 : bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
1200 16154664 : if (WARN_ON_ONCE(bytes == 0))
1201 : return -EIO;
1202 :
1203 16154664 : pos += bytes;
1204 16154664 : length -= bytes;
1205 16154664 : written += bytes;
1206 16154664 : } while (length > 0);
1207 :
1208 16137147 : if (did_zero)
1209 1756225 : *did_zero = true;
1210 : return written;
1211 : }
1212 :
1213 : int
1214 32581285 : iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1215 : const struct iomap_ops *ops)
1216 : {
1217 32581285 : struct iomap_iter iter = {
1218 : .inode = inode,
1219 : .pos = pos,
1220 : .len = len,
1221 : .flags = IOMAP_ZERO,
1222 : };
1223 32581285 : int ret;
1224 :
1225 74562921 : while ((ret = iomap_iter(&iter, ops)) > 0)
1226 41977097 : iter.processed = iomap_zero_iter(&iter, did_zero);
1227 32591870 : return ret;
1228 : }
1229 : EXPORT_SYMBOL_GPL(iomap_zero_range);
1230 :
1231 : int
1232 2665277 : iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1233 : const struct iomap_ops *ops)
1234 : {
1235 2665277 : unsigned int blocksize = i_blocksize(inode);
1236 2665277 : unsigned int off = pos & (blocksize - 1);
1237 :
1238 : /* Block boundary? Nothing to do */
1239 2665277 : if (!off)
1240 : return 0;
1241 2120020 : return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
1242 : }
1243 : EXPORT_SYMBOL_GPL(iomap_truncate_page);
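/*
 * Worked example (illustration only): with a 4096-byte block size and
 * pos == 6144, off == 6144 & 4095 == 2048, so the remainder of the
 * block is zeroed via iomap_zero_range(inode, 6144, 2048, did_zero,
 * ops); a block-aligned pos (off == 0) returns immediately.
 */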
1244 :
1245 5309930 : static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
1246 : struct folio *folio)
1247 : {
1248 5309930 : loff_t length = iomap_length(iter);
1249 5309930 : int ret;
1250 :
1251 5309930 : if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
1252 0 : ret = __block_write_begin_int(folio, iter->pos, length, NULL,
1253 0 : &iter->iomap);
1254 0 : if (ret)
1255 0 : return ret;
1256 0 : block_commit_write(&folio->page, 0, length);
1257 : } else {
1258 10619869 : WARN_ON_ONCE(!folio_test_uptodate(folio));
1259 5309941 : folio_mark_dirty(folio);
1260 : }
1261 :
1262 : return length;
1263 : }
1264 :
1265 3579329 : vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1266 : {
1267 3579329 : struct iomap_iter iter = {
1268 3579329 : .inode = file_inode(vmf->vma->vm_file),
1269 : .flags = IOMAP_WRITE | IOMAP_FAULT,
1270 : };
1271 3579329 : struct folio *folio = page_folio(vmf->page);
1272 3579329 : ssize_t ret;
1273 :
1274 3579329 : folio_lock(folio);
1275 3579321 : ret = folio_mkwrite_check_truncate(folio, iter.inode);
1276 3579288 : if (ret < 0)
1277 175 : goto out_unlock;
1278 3579113 : iter.pos = folio_pos(folio);
1279 3579113 : iter.len = ret;
1280 8888981 : while ((ret = iomap_iter(&iter, ops)) > 0)
1281 5309930 : iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
1282 :
1283 3579124 : if (ret < 0)
1284 21556 : goto out_unlock;
1285 3557568 : folio_wait_stable(folio);
1286 3557568 : return VM_FAULT_LOCKED;
1287 21731 : out_unlock:
1288 21731 : folio_unlock(folio);
1289 21731 : return block_page_mkwrite_return(ret);
1290 : }
1291 : EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
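/*
 * Illustrative fault handler (not from this file): a filesystem's
 * ->page_mkwrite typically takes freeze protection, updates the file
 * time and then lets iomap_page_mkwrite() dirty the folio.  The
 * "myfs_" names are hypothetical.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	ret = iomap_page_mkwrite(vmf, &myfs_page_mkwrite_iomap_ops);
	sb_end_pagefault(inode->i_sb);
	return ret;
}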
1292 :
1293 49470422 : static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
1294 : size_t len, int error)
1295 : {
1296 49470422 : struct iomap_page *iop = to_iomap_page(folio);
1297 :
1298 49470422 : if (error) {
1299 132974 : folio_set_error(folio);
1300 132974 : mapping_set_error(inode->i_mapping, error);
1301 : }
1302 :
1303 100798234 : WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
1304 98940569 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1305 :
1306 98940567 : if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1307 44882468 : folio_end_writeback(folio);
1308 49470444 : }
1309 :
1310 : /*
1311 : * We're now finished for good with this ioend structure. Update the page
1312 : * state, release holds on bios, and finally free up memory. Do not use the
1313 : * ioend after this.
1314 : */
1315 : static u32
1316 30913804 : iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1317 : {
1318 30913804 : struct inode *inode = ioend->io_inode;
1319 30913804 : struct bio *bio = &ioend->io_inline_bio;
1320 30913804 : struct bio *last = ioend->io_bio, *next;
1321 30913804 : u64 start = bio->bi_iter.bi_sector;
1322 30913804 : loff_t offset = ioend->io_offset;
1323 30913804 : bool quiet = bio_flagged(bio, BIO_QUIET);
1324 30913804 : u32 folio_count = 0;
1325 :
1326 61829903 : for (bio = &ioend->io_inline_bio; bio; bio = next) {
1327 30915920 : struct folio_iter fi;
1328 :
1329 : /*
1330 : * For the last bio, bi_private points to the ioend, so we
1331 : * need to explicitly end the iteration here.
1332 : */
1333 30915920 : if (bio == last)
1334 : next = NULL;
1335 : else
1336 1965 : next = bio->bi_private;
1337 :
1338 : /* walk all folios in bio, ending page IO on them */
1339 80386379 : bio_for_each_folio_all(fi, bio) {
1340 49470442 : iomap_finish_folio_write(inode, fi.folio, fi.length,
1341 : error);
1342 49470422 : folio_count++;
1343 : }
1344 30915949 : bio_put(bio);
1345 : }
1346 : /* The ioend has been freed by bio_put() */
1347 :
1348 30913983 : if (unlikely(error && !quiet)) {
1349 44406 : printk_ratelimited(KERN_ERR
1350 : "%s: writeback error on inode %lu, offset %lld, sector %llu",
1351 : inode->i_sb->s_id, inode->i_ino, offset, start);
1352 : }
1353 30913984 : return folio_count;
1354 : }
1355 :
1356 : /*
1357 : * Ioend completion routine for merged bios. This can only be called from task
1358 : * contexts as merged ioends can be of unbounded length. Hence we have to break up
1359 : * the writeback completions into manageable chunks to avoid long scheduler
1360 : * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
1361 : * good batch processing throughput without creating adverse scheduler latency
1362 : * conditions.
1363 : */
1364 : void
1365 26834983 : iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1366 : {
1367 26834983 : struct list_head tmp;
1368 26834983 : u32 completions;
1369 :
1370 26834983 : might_sleep();
1371 :
1372 26834991 : list_replace_init(&ioend->io_list, &tmp);
1373 26834991 : completions = iomap_finish_ioend(ioend, error);
1374 :
1375 26839068 : while (!list_empty(&tmp)) {
1376 4077 : if (completions > IOEND_BATCH_SIZE * 8) {
1377 0 : cond_resched();
1378 0 : completions = 0;
1379 : }
1380 4077 : ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1381 4077 : list_del_init(&ioend->io_list);
1382 4077 : completions += iomap_finish_ioend(ioend, error);
1383 : }
1384 26834991 : }
1385 : EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1386 :
1387 : /*
1388 : * We can merge two adjacent ioends if they have the same set of work to do.
1389 : */
1390 : static bool
1391 1866560 : iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1392 : {
1393 1866560 : if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1394 : return false;
1395 1866550 : if ((ioend->io_flags & IOMAP_F_SHARED) ^
1396 1866550 : (next->io_flags & IOMAP_F_SHARED))
1397 : return false;
1398 1858125 : if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1399 1858125 : (next->io_type == IOMAP_UNWRITTEN))
1400 : return false;
1401 1349658 : if (ioend->io_offset + ioend->io_size != next->io_offset)
1402 : return false;
1403 : /*
1404 : * Do not merge physically discontiguous ioends. The filesystem
1405 : * completion functions will have to iterate the physical
1406 : * discontiguities even if we merge the ioends at a logical level, so
1407 : * we don't gain anything by merging physical discontiguities here.
1408 : *
1409 : * We cannot use bio->bi_iter.bi_sector here as it is modified during
1410 : * submission so does not point to the start sector of the bio at
1411 : * completion.
1412 : */
1413 400187 : if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
1414 396110 : return false;
1415 : return true;
1416 : }
1417 :
1418 : void
1419 26834991 : iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1420 : {
1421 26834991 : struct iomap_ioend *next;
1422 :
1423 26834991 : INIT_LIST_HEAD(&ioend->io_list);
1424 :
1425 26839068 : while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1426 : io_list))) {
1427 1866560 : if (!iomap_ioend_can_merge(ioend, next))
1428 : break;
1429 4077 : list_move_tail(&next->io_list, &ioend->io_list);
1430 4077 : ioend->io_size += next->io_size;
1431 : }
1432 26834991 : }
1433 : EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
1434 :
1435 : static int
1436 2943710 : iomap_ioend_compare(void *priv, const struct list_head *a,
1437 : const struct list_head *b)
1438 : {
1439 2943710 : struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1440 2943710 : struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1441 :
1442 2943710 : if (ia->io_offset < ib->io_offset)
1443 : return -1;
1444 1018408 : if (ia->io_offset > ib->io_offset)
1445 1018381 : return 1;
1446 : return 0;
1447 : }
1448 :
1449 : void
1450 24972508 : iomap_sort_ioends(struct list_head *ioend_list)
1451 : {
1452 24972508 : list_sort(NULL, ioend_list, iomap_ioend_compare);
1453 24972508 : }
1454 : EXPORT_SYMBOL_GPL(iomap_sort_ioends);
1455 :
1456 4074916 : static void iomap_writepage_end_bio(struct bio *bio)
1457 : {
1458 4074916 : struct iomap_ioend *ioend = bio->bi_private;
1459 :
1460 4074916 : iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1461 4074916 : }
1462 :
1463 : /*
1464 : * Submit the final bio for an ioend.
1465 : *
1466 : * If @error is non-zero, it means that we have a situation where some part of
1467 : * the submission process has failed after we've marked pages for writeback
1468 : * and unlocked them. In this situation, we need to fail the bio instead of
1469 : * submitting it. This typically only happens on a filesystem shutdown.
1470 : */
1471 : static int
1472 30913637 : iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1473 : int error)
1474 : {
1475 30913637 : ioend->io_bio->bi_private = ioend;
1476 30913637 : ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1477 :
1478 30913637 : if (wpc->ops->prepare_ioend)
1479 30913637 : error = wpc->ops->prepare_ioend(ioend, error);
1480 30913398 : if (error) {
1481 : /*
1482 : * If we're failing the IO now, just mark the ioend with an
1483 : * error and finish it. This will run IO completion immediately
1484 : * as there is only one reference to the ioend at this point in
1485 : * time.
1486 : */
1487 151 : ioend->io_bio->bi_status = errno_to_blk_status(error);
1488 151 : bio_endio(ioend->io_bio);
1489 151 : return error;
1490 : }
1491 :
1492 30913247 : submit_bio(ioend->io_bio);
1493 30913247 : return 0;
1494 : }
1495 :
1496 : static struct iomap_ioend *
1497 30913418 : iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1498 : loff_t offset, sector_t sector, struct writeback_control *wbc)
1499 : {
1500 30913418 : struct iomap_ioend *ioend;
1501 30913418 : struct bio *bio;
1502 :
1503 53006129 : bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
1504 : REQ_OP_WRITE | wbc_to_write_flags(wbc),
1505 : GFP_NOFS, &iomap_ioend_bioset);
1506 30913788 : bio->bi_iter.bi_sector = sector;
1507 30913788 : wbc_init_bio(wbc, bio);
1508 :
1509 30913307 : ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1510 30913307 : INIT_LIST_HEAD(&ioend->io_list);
1511 30913307 : ioend->io_type = wpc->iomap.type;
1512 30913307 : ioend->io_flags = wpc->iomap.flags;
1513 30913307 : ioend->io_inode = inode;
1514 30913307 : ioend->io_size = 0;
1515 30913307 : ioend->io_folios = 0;
1516 30913307 : ioend->io_offset = offset;
1517 30913307 : ioend->io_bio = bio;
1518 30913307 : ioend->io_sector = sector;
1519 30913307 : return ioend;
1520 : }
1521 :
1522 : /*
1523 : * Allocate a new bio, and chain the old bio to the new one.
1524 : *
1525 : * Note that we have to perform the chaining in this unintuitive order
1526 : * so that the bi_private linkage is set up in the right direction for the
1527 : * traversal in iomap_finish_ioend().
1528 : */
1529 : static struct bio *
1530 1965 : iomap_chain_bio(struct bio *prev)
1531 : {
1532 1965 : struct bio *new;
1533 :
1534 1965 : new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
1535 1965 : bio_clone_blkg_association(new, prev);
1536 1965 : new->bi_iter.bi_sector = bio_end_sector(prev);
1537 :
1538 1965 : bio_chain(prev, new);
1539 1965 : bio_get(prev); /* for iomap_finish_ioend */
1540 1965 : submit_bio(prev);
1541 1965 : return new;
1542 : }
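
/*
 * Illustration (not code from this file): bio_chain(prev, new) above makes
 * prev->bi_private point at the newly allocated bio, so the bios hang off
 * the ioend as a forward-linked list from io_inline_bio to io_bio.  That is
 * the direction iomap_finish_ioend() relies on when it walks the chain at
 * completion time, roughly:
 *
 *	for (bio = &ioend->io_inline_bio; bio; bio = next) {
 *		next = (bio == ioend->io_bio) ? NULL : bio->bi_private;
 *		bio_for_each_folio_all(fi, bio)
 *			... end writeback on fi.folio ...
 *		bio_put(bio);
 *	}
 */
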
1543 :
1544 : static bool
1545 380639735 : iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1546 : sector_t sector)
1547 : {
1548 380639735 : if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1549 380639735 : (wpc->ioend->io_flags & IOMAP_F_SHARED))
1550 : return false;
1551 380575343 : if (wpc->iomap.type != wpc->ioend->io_type)
1552 : return false;
1553 375869397 : if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1554 : return false;
1555 374383646 : if (sector != bio_end_sector(wpc->ioend->io_bio))
1556 : return false;
1557 : /*
1558 : * Limit ioend bio chain lengths to minimise IO completion latency. This
1559 : * also prevents long tight loops ending page writeback on all the
1560 : * folios in the ioend.
1561 : */
1562 371714852 : if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
1563 78 : return false;
1564 : return true;
1565 : }
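
/*
 * Example of the contiguity rule above (illustrative numbers only): an
 * ioend that began at file offset 0 with io_size 64k and whose bio ends at
 * disk sector 2048 can absorb the block at file offset 64k only if that
 * block was mapped to sector 2048 as well; any gap, or a change in extent
 * type or shared state, starts a new ioend instead.
 */
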
1566 :
1567 : /*
1568 : * Test to see if we have an existing ioend structure that we could append to
1569 : * first; otherwise finish off the current ioend and start another.
1570 : */
1571 : static void
1572 402628466 : iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
1573 : struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1574 : struct writeback_control *wbc, struct list_head *iolist)
1575 : {
1576 402628466 : sector_t sector = iomap_sector(&wpc->iomap, pos);
1577 402628466 : unsigned len = i_blocksize(inode);
1578 402628466 : size_t poff = offset_in_folio(folio, pos);
1579 :
1580 402628466 : if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
1581 30913768 : if (wpc->ioend)
1582 8925252 : list_add(&wpc->ioend->io_list, iolist);
1583 30913766 : wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
1584 : }
1585 :
1586 402627894 : if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
1587 1965 : wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
1588 1965 : bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
1589 : }
1590 :
1591 402628581 : if (iop)
1592 402628262 : atomic_add(len, &iop->write_bytes_pending);
1593 402628690 : wpc->ioend->io_size += len;
1594 402628690 : wbc_account_cgroup_owner(wbc, &folio->page, len);
1595 402628920 : }
1596 :
1597 : /*
1598 : * We implement an immediate ioend submission policy here to avoid needing to
1599 : * chain multiple ioends and hence nest mempool allocations which can violate
1600 : * the forward progress guarantees we need to provide. The current ioend we're
1601 : * adding blocks to is cached in the writepage context, and if the new block
1602 : * doesn't append to the cached ioend, it will create a new ioend and cache that
1603 : * instead.
1604 : *
1605 : * If a new ioend is created and cached, the old ioend is returned and queued
1606 : * locally for submission once the entire page is processed or an error has been
1607 : * detected. While each ioend is submitted as soon as it has been fully
1608 : * built, batching optimisations are provided by higher level block plugging.
1609 : *
1610 : * At the end of a writeback pass, there will be a cached ioend remaining on the
1611 : * writepage context that the caller will need to submit.
1612 : */
1613 : static int
1614 45050744 : iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1615 : struct writeback_control *wbc, struct inode *inode,
1616 : struct folio *folio, u64 end_pos)
1617 : {
1618 45050744 : struct iomap_page *iop = iomap_page_create(inode, folio, 0);
1619 45051178 : struct iomap_ioend *ioend, *next;
1620 45051178 : unsigned len = i_blocksize(inode);
1621 45051178 : unsigned nblocks = i_blocks_per_folio(inode, folio);
1622 45051178 : u64 pos = folio_pos(folio);
1623 45051178 : int error = 0, count = 0, i;
1624 45051178 : LIST_HEAD(submit_list);
1625 :
1626 90101967 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1627 :
1628 : /*
1629 : * Walk through the folio to find areas to write back. If we
1630 : * run off the end of the current map or find the current map
1631 : * invalid, grab a new one.
1632 : */
1633 687763150 : for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1634 642878000 : if (iop && !test_bit(i, iop->uptodate))
1635 221965684 : continue;
1636 :
1637 420912316 : error = wpc->ops->map_blocks(wpc, inode, pos);
1638 420910262 : if (error)
1639 : break;
1640 420744618 : trace_iomap_writepage_map(inode, &wpc->iomap);
1641 420746346 : if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1642 0 : continue;
1643 420746346 : if (wpc->iomap.type == IOMAP_HOLE)
1644 18118000 : continue;
1645 402628346 : iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
1646 : &submit_list);
1647 402628288 : count++;
1648 : }
1649 45050794 : if (count)
1650 44881770 : wpc->ioend->io_folios++;
1651 :
1652 45219107 : WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1653 45050794 : WARN_ON_ONCE(!folio_test_locked(folio));
1654 45050794 : WARN_ON_ONCE(folio_test_writeback(folio));
1655 45050794 : WARN_ON_ONCE(folio_test_dirty(folio));
1656 :
1657 : /*
1658 : * We cannot cancel the ioend directly here on error. We may have
1659 : * already set other pages under writeback and hence we have to run I/O
1660 : * completion to mark the error state of the pages under writeback
1661 : * appropriately.
1662 : */
1663 45050794 : if (unlikely(error)) {
1664 : /*
1665 : * Let the filesystem know what portion of the current page
1666 : * failed to map. If the page hasn't been added to an ioend, it
1667 : * won't be affected by I/O completion and we must unlock it
1668 : * now.
1669 : */
1670 165644 : if (wpc->ops->discard_folio)
1671 165644 : wpc->ops->discard_folio(folio, pos);
1672 165644 : if (!count) {
1673 165616 : folio_unlock(folio);
1674 165616 : goto done;
1675 : }
1676 : }
1677 :
1678 44885178 : folio_start_writeback(folio);
1679 44885773 : folio_unlock(folio);
1680 :
1681 : /*
1682 : * Preserve the original error if there was one; catch
1683 : * submission errors here and propagate into subsequent ioend
1684 : * submissions.
1685 : */
1686 53810522 : list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1687 8925201 : int error2;
1688 :
1689 8925201 : list_del_init(&ioend->io_list);
1690 8925248 : error2 = iomap_submit_ioend(wpc, ioend, error);
1691 8925248 : if (error2 && !error)
1692 0 : error = error2;
1693 : }
1694 :
1695 : /*
1696 : * We can end up here with no error and nothing to write only if we race
1697 : * with a partial page truncate on a sub-page block sized filesystem.
1698 : */
1699 44885321 : if (!count)
1700 3408 : folio_end_writeback(folio);
1701 44881913 : done:
1702 45050937 : mapping_set_error(inode->i_mapping, error);
1703 45050685 : return error;
1704 : }
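
/*
 * Sketch of the ->map_blocks callback that the loop above drives.  "myfs"
 * is a hypothetical, purely block-mapped filesystem and myfs_get_block()
 * stands in for its block lookup; for simplicity the sketch maps a single
 * block per call.  Real implementations (XFS, gfs2, zonefs) also revalidate
 * cached mappings and handle delalloc, unwritten and shared extents here.
 */
static int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t pos)
{
	unsigned int blkbits = inode->i_blkbits;
	sector_t block = pos >> blkbits;
	sector_t phys;
	int error;

	/* The cached mapping still covers this offset; reuse it. */
	if (pos >= wpc->iomap.offset &&
	    pos < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	error = myfs_get_block(inode, block, &phys);	/* hypothetical */
	if (error)
		return error;

	wpc->iomap.type = IOMAP_MAPPED;
	wpc->iomap.flags = 0;
	wpc->iomap.bdev = inode->i_sb->s_bdev;
	wpc->iomap.offset = (loff_t)block << blkbits;
	wpc->iomap.length = i_blocksize(inode);
	wpc->iomap.addr = (u64)phys << blkbits;
	return 0;
}
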
1705 :
1706 : /*
1707 : * Write out a dirty page.
1708 : *
1709 : * For delalloc space on the page, we need to allocate space and flush it.
1710 : * For unwritten space on the page, we need to start the conversion to
1711 : * regular allocated space.
1712 : */
1713 45051074 : static int iomap_do_writepage(struct folio *folio,
1714 : struct writeback_control *wbc, void *data)
1715 : {
1716 45051074 : struct iomap_writepage_ctx *wpc = data;
1717 45051074 : struct inode *inode = folio->mapping->host;
1718 45051074 : u64 end_pos, isize;
1719 :
1720 46129031 : trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
1721 :
1722 : /*
1723 : * Refuse to write the folio out if we're called from reclaim context.
1724 : *
1725 : * This avoids stack overflows when called on an already deep stack from
1726 : * random direct reclaim or memcg reclaim callers. We explicitly
1727 : * allow reclaim from kswapd as the stack usage there is relatively low.
1728 : *
1729 : * This should never happen except in the case of a VM regression so
1730 : * warn about it.
1731 : */
1732 45051114 : if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1733 : PF_MEMALLOC))
1734 0 : goto redirty;
1735 :
1736 : /*
1737 : * Is this folio beyond the end of the file?
1738 : *
1739 : * If end_pos runs past i_size, this folio straddles or lies beyond EOF;
1740 : * clamp end_pos to the highest offset the folio should actually represent.
1741 : * -----------------------------------------------------
1742 : * |                file mapping               | <EOF> |
1743 : * -----------------------------------------------------
1744 : * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1745 : * ^--------------------------------^----------|--------
1746 : * |     desired writeback range    |      see below   |
1747 : * ---------------------------------^------------------|
1748 : */
1749 45051114 : isize = i_size_read(inode);
1750 45051114 : end_pos = folio_pos(folio) + folio_size(folio);
1751 45051114 : if (end_pos > isize) {
1752 : /*
1753 : * Check whether the folio to write out lies entirely beyond i_size
1754 : * or merely straddles it.
1755 : * --------------------------------------------------------
1756 : * |                file mapping               |  <EOF>   |
1757 : * --------------------------------------------------------
1758 : * | Page ... | Page N-2 | Page N-1 |  Page N  |  Beyond  |
1759 : * ^--------------------------------^----------|-----------
1760 : * |                                | Straddles|
1761 : * ---------------------------------^----------|----------|
1762 : */
1763 9984304 : size_t poff = offset_in_folio(folio, isize);
1764 9984304 : pgoff_t end_index = isize >> PAGE_SHIFT;
1765 :
1766 : /*
1767 : * Skip the page if it's fully outside i_size, e.g.
1768 : * due to a truncate operation that's in progress. We've
1769 : * cleaned this page and truncate will finish things off for
1770 : * us.
1771 : *
1772 : * Note that the end_index is unsigned long. If the given
1773 : * offset is greater than 16TB on a 32-bit system then if we
1774 : * checked if the page is fully outside i_size with
1775 : * "if (page->index >= end_index + 1)", "end_index + 1" would
1776 : * overflow and evaluate to 0. Hence this page would be
1777 : * redirtied and written out repeatedly, which would result in
1778 : * an infinite loop; the user program performing this operation
1779 : * would hang. Instead, we can detect this situation by
1780 : * checking if the page is totally beyond i_size or if its
1781 : * offset is just equal to the EOF.
1782 : */
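/*
 * Worked example of the overflow above: with 4k pages on a 32-bit system,
 * an i_size just below 16TiB gives end_index == 0xFFFFFFFF == ULONG_MAX,
 * so "end_index + 1" wraps to 0 and a "folio->index >= end_index + 1" test
 * would be true for every folio in the file.
 */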
1783 9984304 : if (folio->index > end_index ||
1784 9954095 : (folio->index == end_index && poff == 0))
1785 114 : goto unlock;
1786 :
1787 : /*
1788 : * The page straddles i_size. It must be zeroed out on each
1789 : * and every writepage invocation because it may be mmapped.
1790 : * "A file is mapped in multiples of the page size. For a file
1791 : * that is not a multiple of the page size, the remaining
1792 : * memory is zeroed when mapped, and writes to that region are
1793 : * not written out to the file."
1794 : */
1795 9984190 : folio_zero_segment(folio, poff, folio_size(folio));
1796 9984190 : end_pos = isize;
1797 : }
1798 :
1799 45051023 : return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
1800 :
1801 : redirty:
1802 0 : folio_redirty_for_writepage(wbc, folio);
1803 114 : unlock:
1804 114 : folio_unlock(folio);
1805 114 : return 0;
1806 : }
1807 :
1808 : int
1809 36621314 : iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1810 : struct iomap_writepage_ctx *wpc,
1811 : const struct iomap_writeback_ops *ops)
1812 : {
1813 36621314 : int ret;
1814 :
1815 36621314 : wpc->ops = ops;
1816 36621314 : ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1817 36621969 : if (!wpc->ioend)
1818 : return ret;
1819 21988691 : return iomap_submit_ioend(wpc, wpc->ioend, ret);
1820 : }
1821 : EXPORT_SYMBOL_GPL(iomap_writepages);
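
/*
 * Sketch of how a filesystem plugs into iomap_writepages() from its
 * ->writepages address_space operation.  The myfs_* names are hypothetical
 * (myfs_map_blocks() and myfs_prepare_ioend() are sketched earlier in this
 * listing); gfs2 and zonefs use essentially this shape, while XFS embeds
 * struct iomap_writepage_ctx in a larger per-writeback context.
 */
static void myfs_discard_folio(struct folio *folio, loff_t pos)
{
	/* punch out any delalloc blocks from pos onward that failed to map */
}

static const struct iomap_writeback_ops myfs_writeback_ops = {
	.map_blocks	= myfs_map_blocks,
	.prepare_ioend	= myfs_prepare_ioend,
	.discard_folio	= myfs_discard_folio,
};

static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &myfs_writeback_ops);
}
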
1822 :
1823 0 : static int __init iomap_init(void)
1824 : {
1825 0 : return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1826 : offsetof(struct iomap_ioend, io_inline_bio),
1827 : BIOSET_NEED_BVECS);
1828 : }
1829 : fs_initcall(iomap_init);