Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2010 Red Hat, Inc.
4 : * Copyright (C) 2016-2019 Christoph Hellwig.
5 : */
6 : #include <linux/module.h>
7 : #include <linux/compiler.h>
8 : #include <linux/fs.h>
9 : #include <linux/iomap.h>
10 : #include <linux/pagemap.h>
11 : #include <linux/uio.h>
12 : #include <linux/buffer_head.h>
13 : #include <linux/dax.h>
14 : #include <linux/writeback.h>
15 : #include <linux/list_sort.h>
16 : #include <linux/swap.h>
17 : #include <linux/bio.h>
18 : #include <linux/sched/signal.h>
19 : #include <linux/migrate.h>
20 : #include "trace.h"
21 :
22 : #include "../internal.h"
23 :
24 : #define IOEND_BATCH_SIZE 4096
25 :
26 : /*
27 : * Structure allocated for each folio when block size < folio size
28 : * to track sub-folio uptodate status and I/O completions.
29 : */
30 : struct iomap_page {
31 : atomic_t read_bytes_pending;
32 : atomic_t write_bytes_pending;
33 : spinlock_t uptodate_lock;
34 : unsigned long uptodate[];
35 : };
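As a rough illustration of how the uptodate[] bitmap above is indexed (a sketch, not part of this file; the helper name is invented), a byte range within the folio maps to one bit per filesystem block, mirroring the arithmetic used by iomap_iop_set_range_uptodate() further down:

static inline void example_range_to_uptodate_bits(struct inode *inode,
                size_t off, size_t len, unsigned *first_blk, unsigned *last_blk)
{
        /* one uptodate bit per filesystem block covered by [off, off + len) */
        *first_blk = off >> inode->i_blkbits;
        *last_blk = (off + len - 1) >> inode->i_blkbits;
}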
36 :
37 : static inline struct iomap_page *to_iomap_page(struct folio *folio)
38 : {
39 15165710938 : if (folio_test_private(folio))
40 337417466 : return folio_get_private(folio);
41 : return NULL;
42 : }
43 :
44 : static struct bio_set iomap_ioend_bioset;
45 :
46 : static struct iomap_page *
47 6473876473 : iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
48 : {
49 6473876473 : struct iomap_page *iop = to_iomap_page(folio);
50 6473876473 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
51 6472552480 : gfp_t gfp;
52 :
53 6472552480 : if (iop || nr_blocks <= 1)
54 : return iop;
55 :
56 173327364 : if (flags & IOMAP_NOWAIT)
57 : gfp = GFP_NOWAIT;
58 : else
59 173328009 : gfp = GFP_NOFS | __GFP_NOFAIL;
60 :
61 173327364 : iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
62 : gfp);
63 173334677 : if (iop) {
64 173334677 : spin_lock_init(&iop->uptodate_lock);
65 173338700 : if (folio_test_uptodate(folio))
66 25 : bitmap_fill(iop->uptodate, nr_blocks);
67 173338675 : folio_attach_private(folio, iop);
68 : }
69 : return iop;
70 : }
71 :
72 173344063 : static void iomap_page_release(struct folio *folio)
73 : {
74 173344063 : struct iomap_page *iop = folio_detach_private(folio);
75 173347439 : struct inode *inode = folio->mapping->host;
76 173347439 : unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
77 :
78 173344643 : if (!iop)
79 : return;
80 173344643 : WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
81 173344643 : WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
82 346686538 : WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
83 : folio_test_uptodate(folio));
84 173341895 : kfree(iop);
85 : }
86 :
87 : /*
88 : * Calculate the range inside the folio that we actually need to read.
89 : */
90 5594998053 : static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
91 : loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
92 : {
93 5594998053 : struct iomap_page *iop = to_iomap_page(folio);
94 5594998053 : loff_t orig_pos = *pos;
95 5594998053 : loff_t isize = i_size_read(inode);
96 5594998053 : unsigned block_bits = inode->i_blkbits;
97 5594998053 : unsigned block_size = (1 << block_bits);
98 5594998053 : size_t poff = offset_in_folio(folio, *pos);
99 5593378489 : size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
100 5593691499 : unsigned first = poff >> block_bits;
101 5593691499 : unsigned last = (poff + plen - 1) >> block_bits;
102 :
103 : /*
104 : * If the block size is smaller than the folio size, we need to check the
105 : * per-block uptodate status and adjust the offset and length if needed
106 : * to avoid reading in already uptodate ranges.
107 : */
108 5593691499 : if (iop) {
109 : unsigned int i;
110 :
111 : /* move forward for each leading block marked uptodate */
112 219614569 : for (i = first; i <= last; i++) {
113 412274120 : if (!test_bit(i, iop->uptodate))
114 : break;
115 14512469 : *pos += block_size;
116 14512469 : poff += block_size;
117 14512469 : plen -= block_size;
118 14512469 : first++;
119 : }
120 :
121 : /* truncate len if we find any trailing uptodate block(s) */
122 1079751720 : for ( ; i <= last; i++) {
123 1748162747 : if (test_bit(i, iop->uptodate)) {
124 1 : plen -= (last - i + 1) * block_size;
125 1 : last = i - 1;
126 1 : break;
127 : }
128 : }
129 : }
130 :
131 : /*
132 : * If the extent spans the block that contains the i_size, we need to
133 : * handle both halves separately so that we properly zero data in the
134 : * page cache for blocks that are entirely outside of i_size.
135 : */
136 5594263556 : if (orig_pos <= isize && orig_pos + length > isize) {
137 630356736 : unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
138 :
139 630535324 : if (first <= end && last > end)
140 7131688 : plen -= (last - end) * block_size;
141 : }
142 :
143 5594442144 : *offp = poff;
144 5594442144 : *lenp = plen;
145 5594442144 : }
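A worked example may help here (values invented for illustration): assume a 16k folio with 4k blocks, a read covering the whole folio, an i_size beyond the folio, and an iop whose bitmap says only block 0 is already uptodate. The loops above then trim the leading uptodate block:

        on entry:  *pos = 0,    poff = 0,    plen = 16384, first = 0, last = 3
        on return: *pos = 4096, poff = 4096, plen = 12288, first = 1, last = 3

so only the three blocks that still need data are actually read.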
146 :
147 245957819 : static void iomap_iop_set_range_uptodate(struct folio *folio,
148 : struct iomap_page *iop, size_t off, size_t len)
149 : {
150 245957819 : struct inode *inode = folio->mapping->host;
151 245957819 : unsigned first = off >> inode->i_blkbits;
152 245957819 : unsigned last = (off + len - 1) >> inode->i_blkbits;
153 245957819 : unsigned long flags;
154 :
155 245957819 : spin_lock_irqsave(&iop->uptodate_lock, flags);
156 245963798 : bitmap_set(iop->uptodate, first, last - first + 1);
157 245961355 : if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
158 214176484 : folio_mark_uptodate(folio);
159 245965383 : spin_unlock_irqrestore(&iop->uptodate_lock, flags);
160 245957039 : }
161 :
162 6875093402 : static void iomap_set_range_uptodate(struct folio *folio,
163 : struct iomap_page *iop, size_t off, size_t len)
164 : {
165 6875093402 : if (iop)
166 245958457 : iomap_iop_set_range_uptodate(folio, iop, off, len);
167 : else
168 6629134945 : folio_mark_uptodate(folio);
169 6879953265 : }
170 :
171 122958531 : static void iomap_finish_folio_read(struct folio *folio, size_t offset,
172 : size_t len, int error)
173 : {
174 122958531 : struct iomap_page *iop = to_iomap_page(folio);
175 :
176 122958531 : if (unlikely(error)) {
177 5693 : folio_clear_uptodate(folio);
178 5693 : folio_set_error(folio);
179 : } else {
180 122952838 : iomap_set_range_uptodate(folio, iop, offset, len);
181 : }
182 :
183 122958531 : if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
184 119166115 : folio_unlock(folio);
185 122958531 : }
186 :
187 60687359 : static void iomap_read_end_io(struct bio *bio)
188 : {
189 60687359 : int error = blk_status_to_errno(bio->bi_status);
190 60687359 : struct folio_iter fi;
191 :
192 183645890 : bio_for_each_folio_all(fi, bio)
193 122958531 : iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
194 60687359 : bio_put(bio);
195 60687359 : }
196 :
197 : struct iomap_readpage_ctx {
198 : struct folio *cur_folio;
199 : bool cur_folio_in_bio;
200 : struct bio *bio;
201 : struct readahead_control *rac;
202 : };
203 :
204 : /**
205 : * iomap_read_inline_data - copy inline data into the page cache
206 : * @iter: iteration structure
207 : * @folio: folio to copy to
208 : *
209 : * Copy the inline data in @iter into @folio and zero out the rest of the folio.
210 : * Only a single IOMAP_INLINE extent is allowed at the end of each file.
211 : * Returns zero for success to complete the read, or the usual negative errno.
212 : */
213 0 : static int iomap_read_inline_data(const struct iomap_iter *iter,
214 : struct folio *folio)
215 : {
216 0 : struct iomap_page *iop;
217 0 : const struct iomap *iomap = iomap_iter_srcmap(iter);
218 0 : size_t size = i_size_read(iter->inode) - iomap->offset;
219 0 : size_t poff = offset_in_page(iomap->offset);
220 0 : size_t offset = offset_in_folio(folio, iomap->offset);
221 0 : void *addr;
222 :
223 0 : if (folio_test_uptodate(folio))
224 : return 0;
225 :
226 0 : if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
227 : return -EIO;
228 0 : if (WARN_ON_ONCE(size > PAGE_SIZE -
229 : offset_in_page(iomap->inline_data)))
230 : return -EIO;
231 0 : if (WARN_ON_ONCE(size > iomap->length))
232 : return -EIO;
233 0 : if (offset > 0)
234 0 : iop = iomap_page_create(iter->inode, folio, iter->flags);
235 : else
236 0 : iop = to_iomap_page(folio);
237 :
238 0 : addr = kmap_local_folio(folio, offset);
239 0 : memcpy(addr, iomap->inline_data, size);
240 0 : memset(addr + size, 0, PAGE_SIZE - poff - size);
241 0 : kunmap_local(addr);
242 0 : iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
243 0 : return 0;
244 : }
245 :
246 4782108713 : static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
247 : loff_t pos)
248 : {
249 4782108713 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
250 :
251 4915391000 : return srcmap->type != IOMAP_MAPPED ||
252 4782108713 : (srcmap->flags & IOMAP_F_NEW) ||
253 133275174 : pos >= i_size_read(iter->inode);
254 : }
255 :
256 4687008522 : static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
257 : struct iomap_readpage_ctx *ctx, loff_t offset)
258 : {
259 4687008522 : const struct iomap *iomap = &iter->iomap;
260 4687008522 : loff_t pos = iter->pos + offset;
261 4687008522 : loff_t length = iomap_length(iter) - offset;
262 4687008522 : struct folio *folio = ctx->cur_folio;
263 4687008522 : struct iomap_page *iop;
264 4687008522 : loff_t orig_pos = pos;
265 4687008522 : size_t poff, plen;
266 4687008522 : sector_t sector;
267 :
268 4687008522 : if (iomap->type == IOMAP_INLINE)
269 0 : return iomap_read_inline_data(iter, folio);
270 :
271 : /* zero post-eof blocks as the page may be mapped */
272 4687008522 : iop = iomap_page_create(iter->inode, folio, iter->flags);
273 4686349777 : iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
274 4686357315 : if (plen == 0)
275 163 : goto done;
276 :
277 4686357152 : if (iomap_block_needs_zeroing(iter, pos)) {
278 4562677907 : folio_zero_range(folio, poff, plen);
279 4563585626 : iomap_set_range_uptodate(folio, iop, poff, plen);
280 4565959002 : goto done;
281 : }
282 :
283 123679245 : ctx->cur_folio_in_bio = true;
284 123679245 : if (iop)
285 16695504 : atomic_add(plen, &iop->read_bytes_pending);
286 :
287 123679263 : sector = iomap_sector(iomap, pos);
288 123679263 : if (!ctx->bio ||
289 137742803 : bio_end_sector(ctx->bio) != sector ||
290 63134479 : !bio_add_folio(ctx->bio, folio, plen, poff)) {
291 60649263 : gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
292 60649263 : gfp_t orig_gfp = gfp;
293 60649263 : unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
294 :
295 60649263 : if (ctx->bio)
296 11578651 : submit_bio(ctx->bio);
297 :
298 60651059 : if (ctx->rac) /* same as readahead_gfp_mask */
299 20349300 : gfp |= __GFP_NORETRY | __GFP_NOWARN;
300 60651059 : ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
301 : REQ_OP_READ, gfp);
302 : /*
303 : * If the bio_alloc fails, try it again for a single page to
304 : * avoid having to deal with partial page reads. This emulates
305 : * what do_mpage_read_folio does.
306 : */
307 60669237 : if (!ctx->bio) {
308 0 : ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
309 : orig_gfp);
310 : }
311 60669237 : if (ctx->rac)
312 20349529 : ctx->bio->bi_opf |= REQ_RAHEAD;
313 60669237 : ctx->bio->bi_iter.bi_sector = sector;
314 60669237 : ctx->bio->bi_end_io = iomap_read_end_io;
315 60669237 : bio_add_folio_nofail(ctx->bio, folio, plen, poff);
316 : }
317 :
318 63029932 : done:
319 : /*
320 : * Move the caller beyond our range so that it keeps making progress.
321 : * For that, we have to include any leading non-uptodate ranges, but
322 : * we can skip trailing ones as they will be handled in the next
323 : * iteration.
324 : */
325 4689653709 : return pos - orig_pos + plen;
326 : }
327 :
328 3489647552 : int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
329 : {
330 6976939913 : struct iomap_iter iter = {
331 3489647552 : .inode = folio->mapping->host,
332 : .pos = folio_pos(folio),
333 3489647552 : .len = folio_size(folio),
334 : };
335 3487292361 : struct iomap_readpage_ctx ctx = {
336 : .cur_folio = folio,
337 : };
338 3487292361 : int ret;
339 :
340 3487292361 : trace_iomap_readpage(iter.inode, 1);
341 :
342 6976031616 : while ((ret = iomap_iter(&iter, ops)) > 0)
343 3485398583 : iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
344 :
345 3488368738 : if (ret < 0)
346 286 : folio_set_error(folio);
347 :
348 3488368738 : if (ctx.bio) {
349 40303865 : submit_bio(ctx.bio);
350 40337540 : WARN_ON_ONCE(!ctx.cur_folio_in_bio);
351 : } else {
352 3448064873 : WARN_ON_ONCE(ctx.cur_folio_in_bio);
353 3448064873 : folio_unlock(folio);
354 : }
355 :
356 : /*
357 : * Just like mpage_readahead and block_read_full_folio, we always
358 : * return 0 and just set the folio error flag on errors. This
359 : * should be cleaned up throughout the stack eventually.
360 : */
361 3489716867 : return 0;
362 : }
363 : EXPORT_SYMBOL_GPL(iomap_read_folio);
364 :
365 81263432 : static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
366 : struct iomap_readpage_ctx *ctx)
367 : {
368 81263432 : loff_t length = iomap_length(iter);
369 81263432 : loff_t done, ret;
370 :
371 1282467431 : for (done = 0; done < length; done += ret) {
372 1201203042 : if (ctx->cur_folio &&
373 1153845595 : offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
374 1135849654 : if (!ctx->cur_folio_in_bio)
375 1062508036 : folio_unlock(ctx->cur_folio);
376 1135880381 : ctx->cur_folio = NULL;
377 : }
378 1201189359 : if (!ctx->cur_folio) {
379 1183213595 : ctx->cur_folio = readahead_folio(ctx->rac);
380 1183241973 : ctx->cur_folio_in_bio = false;
381 : }
382 1201217737 : ret = iomap_readpage_iter(iter, ctx, done);
383 1201203999 : if (ret <= 0)
384 0 : return ret;
385 : }
386 :
387 : return done;
388 : }
389 :
390 : /**
391 : * iomap_readahead - Attempt to read pages from a file.
392 : * @rac: Describes the pages to be read.
393 : * @ops: The operations vector for the filesystem.
394 : *
395 : * This function is for filesystems to call to implement their readahead
396 : * address_space operation.
397 : *
398 : * Context: The @ops callbacks may submit I/O (eg to read the addresses of
399 : * blocks from disc), and may wait for it. The caller may be trying to
400 : * access a different page, and so sleeping excessively should be avoided.
401 : * It may allocate memory, but should avoid costly allocations. This
402 : * function is called with memalloc_nofs set, so allocations will not cause
403 : * the filesystem to be reentered.
404 : */
405 47358409 : void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
406 : {
407 47358409 : struct iomap_iter iter = {
408 47358409 : .inode = rac->mapping->host,
409 : .pos = readahead_pos(rac),
410 : .len = readahead_length(rac),
411 : };
412 47358409 : struct iomap_readpage_ctx ctx = {
413 : .rac = rac,
414 : };
415 :
416 47358409 : trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
417 :
418 128622778 : while (iomap_iter(&iter, ops) > 0)
419 81263576 : iter.processed = iomap_readahead_iter(&iter, &ctx);
420 :
421 47358333 : if (ctx.bio)
422 8770829 : submit_bio(ctx.bio);
423 47358222 : if (ctx.cur_folio) {
424 47358103 : if (!ctx.cur_folio_in_bio)
425 41872885 : folio_unlock(ctx.cur_folio);
426 : }
427 47358364 : }
428 : EXPORT_SYMBOL_GPL(iomap_readahead);
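For context, iomap_read_folio() and iomap_readahead() are meant to be wired directly into a filesystem's address_space_operations. A minimal sketch, assuming a hypothetical example_iomap_ops provided by the filesystem (names are illustrative, not taken from any real filesystem):

extern const struct iomap_ops example_iomap_ops;        /* hypothetical */

static int example_read_folio(struct file *file, struct folio *folio)
{
        return iomap_read_folio(folio, &example_iomap_ops);
}

static void example_readahead(struct readahead_control *rac)
{
        iomap_readahead(rac, &example_iomap_ops);
}

static const struct address_space_operations example_aops = {
        .read_folio     = example_read_folio,
        .readahead      = example_readahead,
        /* write paths, ->release_folio, ->invalidate_folio etc. omitted */
};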
429 :
430 : /*
431 : * iomap_is_partially_uptodate checks whether blocks within a folio are
432 : * uptodate or not.
433 : *
434 : * Returns true if all blocks which correspond to the specified part
435 : * of the folio are uptodate.
436 : */
437 28187 : bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
438 : {
439 28187 : struct iomap_page *iop = to_iomap_page(folio);
440 28187 : struct inode *inode = folio->mapping->host;
441 28187 : unsigned first, last, i;
442 :
443 28187 : if (!iop)
444 : return false;
445 :
446 : /* Caller's range may extend past the end of this folio */
447 37 : count = min(folio_size(folio) - from, count);
448 :
449 : /* First and last blocks in range within folio */
450 37 : first = from >> inode->i_blkbits;
451 37 : last = (from + count - 1) >> inode->i_blkbits;
452 :
453 138 : for (i = first; i <= last; i++)
454 276 : if (!test_bit(i, iop->uptodate))
455 : return false;
456 : return true;
457 : }
458 : EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
459 :
460 : /**
461 : * iomap_get_folio - get a folio reference for writing
462 : * @iter: iteration structure
463 : * @pos: start offset of write
464 : *
465 : * Returns a locked reference to the folio at @pos, or an error pointer if the
466 : * folio could not be obtained.
467 : */
468 2091972287 : struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
469 : {
470 2091972287 : unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
471 :
472 2091972287 : if (iter->flags & IOMAP_NOWAIT)
473 0 : fgp |= FGP_NOWAIT;
474 :
475 4184715028 : return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
476 2091972287 : fgp, mapping_gfp_mask(iter->inode->i_mapping));
477 : }
478 : EXPORT_SYMBOL_GPL(iomap_get_folio);
479 :
480 15043493 : bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
481 : {
482 15043489 : trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
483 15043493 : folio_size(folio));
484 :
485 : /*
486 : * mm accommodates an old ext3 case where clean folios might
487 : * not have had the dirty bit cleared. Thus, it can send actual
488 : * dirty folios to ->release_folio() via shrink_active_list();
489 : * skip those here.
490 : */
491 15043485 : if (folio_test_dirty(folio) || folio_test_writeback(folio))
492 : return false;
493 15043484 : iomap_page_release(folio);
494 15043484 : return true;
495 : }
496 : EXPORT_SYMBOL_GPL(iomap_release_folio);
497 :
498 158302724 : void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
499 : {
500 158302724 : trace_iomap_invalidate_folio(folio->mapping->host,
501 158302724 : folio_pos(folio) + offset, len);
502 :
503 : /*
504 : * If we're invalidating the entire folio, clear the dirty state
505 : * from it and release it to avoid unnecessary buildup of the LRU.
506 : */
507 158302101 : if (offset == 0 && len == folio_size(folio)) {
508 145386188 : WARN_ON_ONCE(folio_test_writeback(folio));
509 145386188 : folio_cancel_dirty(folio);
510 145386767 : iomap_page_release(folio);
511 12915532 : } else if (folio_test_large(folio)) {
512 : /* Must release the iop so the page can be split */
513 25830918 : WARN_ON_ONCE(!folio_test_uptodate(folio) &&
514 : folio_test_dirty(folio));
515 12915458 : iomap_page_release(folio);
516 : }
517 158304451 : }
518 : EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
519 :
520 : static void
521 101686 : iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
522 : {
523 101686 : loff_t i_size = i_size_read(inode);
524 :
525 : /*
526 : * Only truncate newly allocated pages beyond EOF, even if the
527 : * write started inside the existing inode size.
528 : */
529 101686 : if (pos + len > i_size)
530 36078 : truncate_pagecache_range(inode, max(pos, i_size),
531 : pos + len - 1);
532 101686 : }
533 :
534 9579238 : static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
535 : size_t poff, size_t plen, const struct iomap *iomap)
536 : {
537 9579238 : struct bio_vec bvec;
538 9579238 : struct bio bio;
539 :
540 9579238 : bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
541 9579473 : bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
542 9579473 : bio_add_folio_nofail(&bio, folio, plen, poff);
543 9579341 : return submit_bio_wait(&bio);
544 : }
545 :
546 2092938554 : static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
547 : size_t len, struct folio *folio)
548 : {
549 2092938554 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
550 2092938554 : struct iomap_page *iop;
551 2092938554 : loff_t block_size = i_blocksize(iter->inode);
552 2093205866 : loff_t block_start = round_down(pos, block_size);
553 2093205866 : loff_t block_end = round_up(pos + len, block_size);
554 2093205866 : unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
555 2093267767 : size_t from = offset_in_folio(folio, pos), to = from + len;
556 2093363312 : size_t poff, plen;
557 :
558 3279300271 : if (folio_test_uptodate(folio))
559 : return 0;
560 907445539 : folio_clear_error(folio);
561 :
562 908202275 : iop = iomap_page_create(iter->inode, folio, iter->flags);
563 907243864 : if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
564 : return -EAGAIN;
565 :
566 907180105 : do {
567 907180105 : iomap_adjust_read_range(iter->inode, folio, &block_start,
568 : block_end - block_start, &poff, &plen);
569 907763381 : if (plen == 0)
570 : break;
571 :
572 894003027 : if (!(iter->flags & IOMAP_UNSHARE) &&
573 892066434 : (from <= poff || from >= poff + plen) &&
574 858475250 : (to <= poff || to >= poff + plen))
575 798691535 : continue;
576 :
577 95311492 : if (iomap_block_needs_zeroing(iter, block_start)) {
578 85732158 : if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
579 : return -EIO;
580 85732158 : folio_zero_segments(folio, poff, from, to, poff + plen);
581 : } else {
582 9579334 : int status;
583 :
584 9579334 : if (iter->flags & IOMAP_NOWAIT)
585 : return -EAGAIN;
586 :
587 9579334 : status = iomap_read_folio_sync(block_start, folio,
588 : poff, plen, srcmap);
589 9579569 : if (status)
590 1456 : return status;
591 : }
592 95404042 : iomap_set_range_uptodate(folio, iop, poff, plen);
593 894155442 : } while ((block_start += plen) < block_end);
594 :
595 : return 0;
596 : }
597 :
598 2091854877 : static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
599 : size_t len)
600 : {
601 2091854877 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
602 :
603 2091854877 : if (folio_ops && folio_ops->get_folio)
604 0 : return folio_ops->get_folio(iter, pos, len);
605 : else
606 2091854877 : return iomap_get_folio(iter, pos);
607 : }
608 :
609 2094367063 : static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
610 : struct folio *folio)
611 : {
612 2094367063 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
613 :
614 2094367063 : if (folio_ops && folio_ops->put_folio) {
615 0 : folio_ops->put_folio(iter->inode, pos, ret, folio);
616 : } else {
617 2094367063 : folio_unlock(folio);
618 2095360010 : folio_put(folio);
619 : }
620 2095490987 : }
621 :
622 0 : static int iomap_write_begin_inline(const struct iomap_iter *iter,
623 : struct folio *folio)
624 : {
625 : /* needs more work for the tailpacking case; disable for now */
626 0 : if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
627 : return -EIO;
628 0 : return iomap_read_inline_data(iter, folio);
629 : }
630 :
631 2092848651 : static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
632 : size_t len, struct folio **foliop)
633 : {
634 2092848651 : const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
635 2092848651 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
636 2092848651 : struct folio *folio;
637 2092848651 : int status = 0;
638 :
639 2092848651 : BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
640 2092848651 : if (srcmap != &iter->iomap)
641 132307575 : BUG_ON(pos + len > srcmap->offset + srcmap->length);
642 :
643 2092848651 : if (fatal_signal_pending(current))
644 : return -EINTR;
645 :
646 4183334350 : if (!mapping_large_folio_support(iter->inode->i_mapping))
647 0 : len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
648 :
649 2091667175 : folio = __iomap_get_folio(iter, pos, len);
650 2091926374 : if (IS_ERR(folio))
651 0 : return PTR_ERR(folio);
652 :
653 : /*
654 : * Now we have a locked folio, before we do anything with it we need to
655 : * check that the iomap we have cached is not stale. The inode extent
656 : * mapping can change due to concurrent IO in flight (e.g.
657 : * IOMAP_UNWRITTEN state can change and memory reclaim could have
658 : * reclaimed a previously partially written page at this index after IO
659 : * completion before this write reaches this file offset) and hence we
660 : * could do the wrong thing here (zero a page range incorrectly or fail
661 : * to zero) and corrupt data.
662 : */
663 2091926374 : if (folio_ops && folio_ops->iomap_valid) {
664 2091926374 : bool iomap_valid = folio_ops->iomap_valid(iter->inode,
665 : &iter->iomap);
666 2092653569 : if (!iomap_valid) {
667 100230 : iter->iomap.flags |= IOMAP_F_STALE;
668 100230 : status = 0;
669 100230 : goto out_unlock;
670 : }
671 : }
672 :
673 2092553339 : if (pos + len > folio_pos(folio) + folio_size(folio))
674 6594284 : len = folio_pos(folio) + folio_size(folio) - pos;
675 :
676 2092723551 : if (srcmap->type == IOMAP_INLINE)
677 0 : status = iomap_write_begin_inline(iter, folio);
678 2092723551 : else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
679 0 : status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
680 : else
681 2092723551 : status = __iomap_write_begin(iter, pos, len, folio);
682 :
683 2092800570 : if (unlikely(status))
684 1456 : goto out_unlock;
685 :
686 2092799114 : *foliop = folio;
687 2092799114 : return 0;
688 :
689 101686 : out_unlock:
690 101686 : __iomap_put_folio(iter, pos, 0, folio);
691 101686 : iomap_write_failed(iter->inode, pos, len);
692 :
693 101686 : return status;
694 : }
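The stale-iomap check above relies entirely on the filesystem's ->iomap_valid callback. A minimal sketch of such a callback, assuming a hypothetical per-inode mapping sequence counter that the filesystem's ->iomap_begin samples into iomap->validity_cookie when it builds the mapping (the example_inode structure and its fields are invented for this sketch):

struct example_inode {
        struct inode    vfs_inode;
        u64             map_seq;        /* bumped whenever the extent map changes */
};

static bool example_iomap_valid(struct inode *inode, const struct iomap *iomap)
{
        struct example_inode *ei =
                container_of(inode, struct example_inode, vfs_inode);

        /*
         * If the mapping sequence has moved on since ->iomap_begin sampled
         * it, the cached iomap no longer describes reality; returning false
         * makes iomap_write_begin() set IOMAP_F_STALE so the iteration is
         * remapped before any data is zeroed or copied.
         */
        return READ_ONCE(ei->map_seq) == iomap->validity_cookie;
}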
695 :
696 2094387940 : static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
697 : size_t copied, struct folio *folio)
698 : {
699 2094387940 : struct iomap_page *iop = to_iomap_page(folio);
700 2094387940 : flush_dcache_folio(folio);
701 :
702 : /*
703 : * The blocks that were entirely written will now be uptodate, so we
704 : * don't have to worry about a read_folio reading them and overwriting a
705 : * partial write. However, if we've encountered a short write and only
706 : * partially written into a block, it will not be marked uptodate, so a
707 : * read_folio might come in and destroy our partial write.
708 : *
709 : * Do the simplest thing and just treat any short write to a
710 : * non-uptodate page as a zero-length write, and force the caller to
711 : * redo the whole thing.
712 : */
713 2094387940 : if (unlikely(copied < len && !folio_test_uptodate(folio)))
714 : return 0;
715 2094387940 : iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
716 2094816940 : filemap_dirty_folio(inode->i_mapping, folio);
717 2094816940 : return copied;
718 : }
719 :
720 0 : static size_t iomap_write_end_inline(const struct iomap_iter *iter,
721 : struct folio *folio, loff_t pos, size_t copied)
722 : {
723 0 : const struct iomap *iomap = &iter->iomap;
724 0 : void *addr;
725 :
726 0 : WARN_ON_ONCE(!folio_test_uptodate(folio));
727 0 : BUG_ON(!iomap_inline_data_valid(iomap));
728 :
729 0 : flush_dcache_folio(folio);
730 0 : addr = kmap_local_folio(folio, pos);
731 0 : memcpy(iomap_inline_data(iomap, pos), addr, copied);
732 0 : kunmap_local(addr);
733 :
734 0 : mark_inode_dirty(iter->inode);
735 0 : return copied;
736 : }
737 :
738 : /* Returns the number of bytes copied. May be 0. Cannot be an errno. */
739 2095273702 : static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
740 : size_t copied, struct folio *folio)
741 : {
742 2095273702 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
743 2095273702 : loff_t old_size = iter->inode->i_size;
744 2095273702 : size_t ret;
745 :
746 2095273702 : if (srcmap->type == IOMAP_INLINE) {
747 0 : ret = iomap_write_end_inline(iter, folio, pos, copied);
748 2095273702 : } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
749 0 : ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
750 : copied, &folio->page, NULL);
751 : } else {
752 2095273702 : ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
753 : }
754 :
755 : /*
756 : * Update the in-memory inode size after copying the data into the page
757 : * cache. It's up to the file system to write the updated size to disk,
758 : * preferably after I/O completion so that no stale data is exposed.
759 : */
760 2094640539 : if (pos + ret > old_size) {
761 700998027 : i_size_write(iter->inode, pos + ret);
762 700998027 : iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
763 : }
764 2094640539 : __iomap_put_folio(iter, pos, ret, folio);
765 :
766 2095380991 : if (old_size < pos)
767 41253453 : pagecache_isize_extended(iter->inode, old_size, pos);
768 2095354419 : if (ret < len)
769 0 : iomap_write_failed(iter->inode, pos + ret, len - ret);
770 2095354419 : return ret;
771 : }
772 :
773 292700434 : static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
774 : {
775 292700434 : loff_t length = iomap_length(iter);
776 292700434 : loff_t pos = iter->pos;
777 292700434 : ssize_t written = 0;
778 292700434 : long status = 0;
779 292700434 : struct address_space *mapping = iter->inode->i_mapping;
780 292700434 : unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
781 :
782 2021198274 : do {
783 2021198274 : struct folio *folio;
784 2021198274 : struct page *page;
785 2021198274 : unsigned long offset; /* Offset into pagecache page */
786 2021198274 : unsigned long bytes; /* Bytes to write to page */
787 2021198274 : size_t copied; /* Bytes copied from user */
788 :
789 2021198274 : offset = offset_in_page(pos);
790 2021198274 : bytes = min_t(unsigned long, PAGE_SIZE - offset,
791 : iov_iter_count(i));
792 2021198274 : again:
793 2021198274 : status = balance_dirty_pages_ratelimited_flags(mapping,
794 : bdp_flags);
795 2020035548 : if (unlikely(status))
796 : break;
797 :
798 2020035548 : if (bytes > length)
799 : bytes = length;
800 :
801 : /*
802 : * Bring in the user page that we'll copy from _first_.
803 : * Otherwise there's a nasty deadlock on copying from the
804 : * same page as we're writing to, without it being marked
805 : * up-to-date.
806 : *
807 : * For async buffered writes the assumption is that the user
808 : * page has already been faulted in. This can be optimized by
809 : * faulting the user page.
810 : */
811 2020035548 : if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
812 : status = -EFAULT;
813 : break;
814 : }
815 :
816 2021558249 : status = iomap_write_begin(iter, pos, bytes, &folio);
817 2020798360 : if (unlikely(status))
818 : break;
819 2020797050 : if (iter->iomap.flags & IOMAP_F_STALE)
820 : break;
821 :
822 2020700151 : page = folio_file_page(folio, pos >> PAGE_SHIFT);
823 2020369723 : if (mapping_writably_mapped(mapping))
824 : flush_dcache_page(page);
825 :
826 2020369723 : copied = copy_page_from_iter_atomic(page, offset, bytes, i);
827 :
828 2022244514 : status = iomap_write_end(iter, pos, bytes, copied, folio);
829 :
830 2021942069 : if (unlikely(copied != status))
831 0 : iov_iter_revert(i, copied - status);
832 :
833 2021942069 : cond_resched();
834 2021824789 : if (unlikely(status == 0)) {
835 : /*
836 : * A short copy made iomap_write_end() reject the
837 : * thing entirely. Might be memory poisoning
838 : * halfway through, might be a race with munmap,
839 : * might be severe memory pressure.
840 : */
841 0 : if (copied)
842 0 : bytes = copied;
843 0 : goto again;
844 : }
845 2021824789 : pos += status;
846 2021824789 : written += status;
847 2021824789 : length -= status;
848 2021824789 : } while (iov_iter_count(i) && length);
849 :
850 293425158 : if (status == -EAGAIN) {
851 0 : iov_iter_revert(i, written);
852 0 : return -EAGAIN;
853 : }
854 293425158 : return written ? written : status;
855 : }
856 :
857 : ssize_t
858 272487675 : iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
859 : const struct iomap_ops *ops)
860 : {
861 272487675 : struct iomap_iter iter = {
862 272487675 : .inode = iocb->ki_filp->f_mapping->host,
863 272487675 : .pos = iocb->ki_pos,
864 : .len = iov_iter_count(i),
865 : .flags = IOMAP_WRITE,
866 : };
867 272487675 : ssize_t ret;
868 :
869 272487675 : if (iocb->ki_flags & IOCB_NOWAIT)
870 0 : iter.flags |= IOMAP_NOWAIT;
871 :
872 565816188 : while ((ret = iomap_iter(&iter, ops)) > 0)
873 292765118 : iter.processed = iomap_write_iter(&iter, i);
874 :
875 272956898 : if (unlikely(iter.pos == iocb->ki_pos))
876 : return ret;
877 269809257 : ret = iter.pos - iocb->ki_pos;
878 269809257 : iocb->ki_pos = iter.pos;
879 269809257 : return ret;
880 : }
881 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
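A filesystem's ->write_iter typically wraps iomap_file_buffered_write() with its own locking and the generic write checks. A rough sketch, again using the hypothetical example_iomap_ops and ignoring O_DIRECT, IOCB_NOWAIT and space reservation details:

static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = iomap_file_buffered_write(iocb, from, &example_iomap_ops);
        inode_unlock(inode);

        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}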
882 :
883 : /*
884 : * Scan the data range passed to us for dirty page cache folios. If we find a
885 : * dirty folio, punch out the preceding range and update the offset from which
886 : * the next punch will start.
887 : *
888 : * We can punch out storage reservations under clean pages because they either
889 : * contain data that has been written back - in which case the delalloc punch
890 : * over that range is a no-op - or they were instantiated by read faults, in
891 : * which case they contain zeroes and we can remove the delalloc backing range;
892 : * any new writes to those pages will do the normal hole filling operation...
893 : *
894 : * This makes the logic simple: we only need to keep the delalloc extents
895 : * over the dirty ranges of the page cache.
896 : *
897 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
898 : * simplify range iterations.
899 : */
900 2172 : static int iomap_write_delalloc_scan(struct inode *inode,
901 : loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
902 : int (*punch)(struct inode *inode, loff_t offset, loff_t length))
903 : {
904 13877 : while (start_byte < end_byte) {
905 11705 : struct folio *folio;
906 :
907 : /* grab locked page */
908 11705 : folio = filemap_lock_folio(inode->i_mapping,
909 11705 : start_byte >> PAGE_SHIFT);
910 11705 : if (IS_ERR(folio)) {
911 143 : start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
912 : PAGE_SIZE;
913 143 : continue;
914 : }
915 :
916 : /* if dirty, punch up to offset */
917 11562 : if (folio_test_dirty(folio)) {
918 278 : if (start_byte > *punch_start_byte) {
919 1 : int error;
920 :
921 1 : error = punch(inode, *punch_start_byte,
922 : start_byte - *punch_start_byte);
923 1 : if (error) {
924 0 : folio_unlock(folio);
925 0 : folio_put(folio);
926 0 : return error;
927 : }
928 : }
929 :
930 : /*
931 : * Make sure the next punch start is correctly bound to
932 : * the end of this data range, not the end of the folio.
933 : */
934 545 : *punch_start_byte = min_t(loff_t, end_byte,
935 : folio_next_index(folio) << PAGE_SHIFT);
936 : }
937 :
938 : /* move offset to start of next folio in range */
939 11562 : start_byte = folio_next_index(folio) << PAGE_SHIFT;
940 11562 : folio_unlock(folio);
941 11562 : folio_put(folio);
942 : }
943 : return 0;
944 : }
945 :
946 : /*
947 : * Punch out all the delalloc blocks in the range given except for those that
948 : * have dirty data still pending in the page cache - those are going to be
949 : * written and so must still retain the delalloc backing for writeback.
950 : *
951 : * As we are scanning the page cache for data, we don't need to reimplement the
952 : * wheel - mapping_seek_hole_data() does exactly what we need to identify the
953 : * start and end of data ranges correctly even for sub-folio block sizes. This
954 : * byte range based iteration is especially convenient because it means we
955 : * don't have to care about variable size folios, nor where the start or end of
956 : * the data range lies within a folio, whether they lie within the same folio, or
957 : * whether there are multiple discontiguous data ranges within the folio.
958 : *
959 : * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
960 : * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
961 : * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
962 : * date. A write page fault can then mark it dirty. If we then fail a write()
963 : * beyond EOF into that up to date cached range, we allocate a delalloc block
964 : * beyond EOF and then have to punch it out. Because the range is up to date,
965 : * mapping_seek_hole_data() will return it, and we will skip the punch because
966 : * the folio is dirty. This is incorrect - we always need to punch out delalloc
967 : * beyond EOF in this case as writeback will never write back and convert that
968 : * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
969 : * resulting in always punching out the range from the EOF to the end of the
970 : * range the iomap spans.
971 : *
972 : * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
973 : * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
974 : * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
975 : * returns the end of the data range (data_end). Using closed intervals would
976 : * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
977 : * the code to subtle off-by-one bugs....
978 : */
979 40113 : static int iomap_write_delalloc_release(struct inode *inode,
980 : loff_t start_byte, loff_t end_byte,
981 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
982 : {
983 40113 : loff_t punch_start_byte = start_byte;
984 40113 : loff_t scan_end_byte = min(i_size_read(inode), end_byte);
985 40113 : int error = 0;
986 :
987 : /*
988 : * Lock the mapping to avoid races with page faults re-instantiating
989 : * folios and dirtying them via ->page_mkwrite whilst we walk the
990 : * cache and perform delalloc extent removal. Failing to do this can
991 : * leave dirty pages with no space reservation in the cache.
992 : */
993 40113 : filemap_invalidate_lock(inode->i_mapping);
994 42287 : while (start_byte < scan_end_byte) {
995 30400 : loff_t data_end;
996 :
997 30400 : start_byte = mapping_seek_hole_data(inode->i_mapping,
998 : start_byte, scan_end_byte, SEEK_DATA);
999 : /*
1000 : * If there is no more data to scan, all that is left is to
1001 : * punch out the remaining range.
1002 : */
1003 30400 : if (start_byte == -ENXIO || start_byte == scan_end_byte)
1004 : break;
1005 2172 : if (start_byte < 0) {
1006 0 : error = start_byte;
1007 0 : goto out_unlock;
1008 : }
1009 2172 : WARN_ON_ONCE(start_byte < punch_start_byte);
1010 2172 : WARN_ON_ONCE(start_byte > scan_end_byte);
1011 :
1012 : /*
1013 : * We find the end of this contiguous cached data range by
1014 : * seeking from start_byte to the beginning of the next hole.
1015 : */
1016 2172 : data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
1017 : scan_end_byte, SEEK_HOLE);
1018 2172 : if (data_end < 0) {
1019 0 : error = data_end;
1020 0 : goto out_unlock;
1021 : }
1022 2172 : WARN_ON_ONCE(data_end <= start_byte);
1023 2172 : WARN_ON_ONCE(data_end > scan_end_byte);
1024 :
1025 2172 : error = iomap_write_delalloc_scan(inode, &punch_start_byte,
1026 : start_byte, data_end, punch);
1027 2172 : if (error)
1028 0 : goto out_unlock;
1029 :
1030 : /* The next data search starts at the end of this one. */
1031 : start_byte = data_end;
1032 : }
1033 :
1034 40115 : if (punch_start_byte < end_byte)
1035 39977 : error = punch(inode, punch_start_byte,
1036 : end_byte - punch_start_byte);
1037 138 : out_unlock:
1038 40114 : filemap_invalidate_unlock(inode->i_mapping);
1039 40114 : return error;
1040 : }
1041 :
1042 : /*
1043 : * When a short write occurs, the filesystem may need to remove reserved space
1044 : * that was allocated in ->iomap_begin from its ->iomap_end method. For
1045 : * filesystems that use delayed allocation, we need to punch out delalloc
1046 : * extents from the range that are not dirty in the page cache. As the write can
1047 : * race with page faults, there can be dirty pages over the delalloc extent
1048 : * outside the range of a short write but still within the delalloc extent
1049 : * allocated for this iomap.
1050 : *
1051 : * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
1052 : * simplify range iterations.
1053 : *
1054 : * The punch() callback *must* only punch delalloc extents in the range passed
1055 : * to it. It must skip over all other types of extents in the range and leave
1056 : * them completely unchanged. It must do this punch atomically with respect to
1057 : * other extent modifications.
1058 : *
1059 : * The punch() callback may be called with a folio locked to prevent writeback
1060 : * extent allocation racing at the edge of the range we are currently punching.
1061 : * The locked folio may or may not cover the range being punched, so it is not
1062 : * safe for the punch() callback to lock folios itself.
1063 : *
1064 : * Lock order is:
1065 : *
1066 : * inode->i_rwsem (shared or exclusive)
1067 : * inode->i_mapping->invalidate_lock (exclusive)
1068 : * folio_lock()
1069 : * ->punch
1070 : * internal filesystem allocation lock
1071 : */
1072 471470231 : int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
1073 : struct iomap *iomap, loff_t pos, loff_t length,
1074 : ssize_t written,
1075 : int (*punch)(struct inode *inode, loff_t pos, loff_t length))
1076 : {
1077 471470231 : loff_t start_byte;
1078 471470231 : loff_t end_byte;
1079 471470231 : unsigned int blocksize = i_blocksize(inode);
1080 :
1081 471119157 : if (iomap->type != IOMAP_DELALLOC)
1082 : return 0;
1083 :
1084 : /* If we didn't reserve the blocks, we're not allowed to punch them. */
1085 173606868 : if (!(iomap->flags & IOMAP_F_NEW))
1086 : return 0;
1087 :
1088 : /*
1089 : * start_byte refers to the first unused block after a short write. If
1090 : * nothing was written, round offset down to point at the first block in
1091 : * the range.
1092 : */
1093 32857889 : if (unlikely(!written))
1094 19484 : start_byte = round_down(pos, blocksize);
1095 : else
1096 32838405 : start_byte = round_up(pos + written, blocksize);
1097 32857889 : end_byte = round_up(pos + length, blocksize);
1098 :
1099 : /* Nothing to do if we've written the entire delalloc extent */
1100 32857889 : if (start_byte >= end_byte)
1101 : return 0;
1102 :
1103 40114 : return iomap_write_delalloc_release(inode, start_byte, end_byte,
1104 : punch);
1105 : }
1106 : EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
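iomap_file_buffered_write_punch_delalloc() is intended to be called from the filesystem's ->iomap_end method for delalloc mappings. A sketch, assuming a hypothetical example_release_delalloc_bytes() helper that drops only delalloc reservations in the given byte range, as the punch contract above requires:

int example_release_delalloc_bytes(struct inode *inode, loff_t offset,
                loff_t length);         /* hypothetical filesystem helper */

static int example_punch(struct inode *inode, loff_t offset, loff_t length)
{
        /* must skip over anything in the range that is not delalloc */
        return example_release_delalloc_bytes(inode, offset, length);
}

static int example_buffered_write_iomap_end(struct inode *inode, loff_t pos,
                loff_t length, ssize_t written, unsigned flags,
                struct iomap *iomap)
{
        if (flags & IOMAP_WRITE)
                return iomap_file_buffered_write_punch_delalloc(inode, iomap,
                                pos, length, written, example_punch);
        return 0;
}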
1107 :
1108 24146264 : static loff_t iomap_unshare_iter(struct iomap_iter *iter)
1109 : {
1110 24146264 : struct iomap *iomap = &iter->iomap;
1111 24146264 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1112 24146264 : loff_t pos = iter->pos;
1113 24146264 : loff_t length = iomap_length(iter);
1114 24146264 : long status = 0;
1115 24146264 : loff_t written = 0;
1116 :
1117 : /* don't bother with blocks that are not shared to start with */
1118 24146264 : if (!(iomap->flags & IOMAP_F_SHARED))
1119 : return length;
1120 : /* don't bother with holes or unwritten extents */
1121 1997577 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1122 : return length;
1123 :
1124 25968055 : do {
1125 25968055 : unsigned long offset = offset_in_page(pos);
1126 25968055 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1127 25968055 : struct folio *folio;
1128 :
1129 25968055 : status = iomap_write_begin(iter, pos, bytes, &folio);
1130 25968212 : if (unlikely(status))
1131 0 : return status;
1132 25968212 : if (iter->iomap.flags & IOMAP_F_STALE)
1133 : break;
1134 :
1135 25966822 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1136 25966826 : if (WARN_ON_ONCE(status == 0))
1137 : return -EIO;
1138 :
1139 25966826 : cond_resched();
1140 :
1141 25966797 : pos += status;
1142 25966797 : written += status;
1143 25966797 : length -= status;
1144 :
1145 25966797 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1146 25966672 : } while (length);
1147 :
1148 : return written;
1149 : }
1150 :
1151 : int
1152 21893258 : iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1153 : const struct iomap_ops *ops)
1154 : {
1155 21893258 : struct iomap_iter iter = {
1156 : .inode = inode,
1157 : .pos = pos,
1158 : .len = len,
1159 : .flags = IOMAP_WRITE | IOMAP_UNSHARE,
1160 : };
1161 21893258 : int ret;
1162 :
1163 46039525 : while ((ret = iomap_iter(&iter, ops)) > 0)
1164 24146272 : iter.processed = iomap_unshare_iter(&iter);
1165 21893490 : return ret;
1166 : }
1167 : EXPORT_SYMBOL_GPL(iomap_file_unshare);
1168 :
1169 0 : static loff_t iomap_dirty_iter(struct iomap_iter *iter)
1170 : {
1171 0 : loff_t pos = iter->pos;
1172 0 : loff_t length = iomap_length(iter);
1173 0 : long status = 0;
1174 0 : loff_t written = 0;
1175 :
1176 0 : do {
1177 0 : unsigned long offset = offset_in_page(pos);
1178 0 : unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
1179 0 : struct folio *folio;
1180 :
1181 0 : status = iomap_write_begin(iter, pos, bytes, &folio);
1182 0 : if (unlikely(status))
1183 0 : return status;
1184 :
1185 0 : folio_mark_accessed(folio);
1186 :
1187 0 : status = iomap_write_end(iter, pos, bytes, bytes, folio);
1188 0 : if (WARN_ON_ONCE(status == 0))
1189 : return -EIO;
1190 :
1191 0 : cond_resched();
1192 :
1193 0 : pos += status;
1194 0 : written += status;
1195 0 : length -= status;
1196 :
1197 0 : balance_dirty_pages_ratelimited(iter->inode->i_mapping);
1198 0 : } while (length);
1199 :
1200 : return written;
1201 : }
1202 :
1203 : int
1204 0 : iomap_dirty_range(struct inode *inode, loff_t pos, u64 len,
1205 : const struct iomap_ops *ops)
1206 : {
1207 0 : struct iomap_iter iter = {
1208 : .inode = inode,
1209 : .pos = pos,
1210 : .len = len,
1211 : .flags = IOMAP_WRITE,
1212 : };
1213 0 : int ret;
1214 :
1215 0 : if (IS_DAX(inode))
1216 : return -EINVAL;
1217 :
1218 0 : while ((ret = iomap_iter(&iter, ops)) > 0)
1219 0 : iter.processed = iomap_dirty_iter(&iter);
1220 : return ret;
1221 : }
1222 : EXPORT_SYMBOL_GPL(iomap_dirty_range);
1223 :
1224 154571313 : static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
1225 : {
1226 154571313 : const struct iomap *srcmap = iomap_iter_srcmap(iter);
1227 154571313 : loff_t pos = iter->pos;
1228 154571313 : loff_t length = iomap_length(iter);
1229 154571313 : loff_t written = 0;
1230 :
1231 : /* already zeroed? we're done. */
1232 154571313 : if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1233 : return length;
1234 :
1235 47135017 : do {
1236 47135017 : struct folio *folio;
1237 47135017 : int status;
1238 47135017 : size_t offset;
1239 47135017 : size_t bytes = min_t(u64, SIZE_MAX, length);
1240 :
1241 47135017 : status = iomap_write_begin(iter, pos, bytes, &folio);
1242 47286568 : if (status)
1243 1161 : return status;
1244 47285407 : if (iter->iomap.flags & IOMAP_F_STALE)
1245 : break;
1246 :
1247 47283466 : offset = offset_in_folio(folio, pos);
1248 47217207 : if (bytes > folio_size(folio) - offset)
1249 6594230 : bytes = folio_size(folio) - offset;
1250 :
1251 47194193 : folio_zero_range(folio, offset, bytes);
1252 47189751 : folio_mark_accessed(folio);
1253 :
1254 47557131 : bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
1255 47376583 : if (WARN_ON_ONCE(bytes == 0))
1256 : return -EIO;
1257 :
1258 47376583 : pos += bytes;
1259 47376583 : length -= bytes;
1260 47376583 : written += bytes;
1261 47376583 : } while (length > 0);
1262 :
1263 40793435 : if (did_zero)
1264 6059119 : *did_zero = true;
1265 : return written;
1266 : }
1267 :
1268 : int
1269 129204219 : iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1270 : const struct iomap_ops *ops)
1271 : {
1272 129204219 : struct iomap_iter iter = {
1273 : .inode = inode,
1274 : .pos = pos,
1275 : .len = len,
1276 : .flags = IOMAP_ZERO,
1277 : };
1278 129204219 : int ret;
1279 :
1280 283998737 : while ((ret = iomap_iter(&iter, ops)) > 0)
1281 154608099 : iter.processed = iomap_zero_iter(&iter, did_zero);
1282 129102099 : return ret;
1283 : }
1284 : EXPORT_SYMBOL_GPL(iomap_zero_range);
1285 :
1286 : int
1287 7412357 : iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1288 : const struct iomap_ops *ops)
1289 : {
1290 7412357 : unsigned int blocksize = i_blocksize(inode);
1291 7412292 : unsigned int off = pos & (blocksize - 1);
1292 :
1293 : /* Block boundary? Nothing to do */
1294 7412292 : if (!off)
1295 : return 0;
1296 5166796 : return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
1297 : }
1298 : EXPORT_SYMBOL_GPL(iomap_truncate_page);
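In a truncate path, iomap_truncate_page() is typically used to zero the now-partial tail block before the size change becomes visible. A minimal sketch of the shrinking case, assuming the caller already holds the usual inode and invalidation locks and uses the hypothetical example_iomap_ops:

static int example_truncate_down(struct inode *inode, loff_t newsize)
{
        bool did_zero = false;
        int error;

        /* zero the partial block at the new EOF, if there is one */
        error = iomap_truncate_page(inode, newsize, &did_zero,
                        &example_iomap_ops);
        if (error)
                return error;

        truncate_setsize(inode, newsize);
        /* the filesystem would free the now-unused blocks past newsize here */
        return 0;
}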
1299 :
1300 95571618 : static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
1301 : struct folio *folio)
1302 : {
1303 95571618 : loff_t length = iomap_length(iter);
1304 95571618 : int ret;
1305 :
1306 95571618 : if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
1307 0 : ret = __block_write_begin_int(folio, iter->pos, length, NULL,
1308 0 : &iter->iomap);
1309 0 : if (ret)
1310 0 : return ret;
1311 0 : block_commit_write(&folio->page, 0, length);
1312 : } else {
1313 191135811 : WARN_ON_ONCE(!folio_test_uptodate(folio));
1314 95563883 : folio_mark_dirty(folio);
1315 : }
1316 :
1317 : return length;
1318 : }
1319 :
1320 92818169 : vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1321 : {
1322 92818169 : struct iomap_iter iter = {
1323 92818169 : .inode = file_inode(vmf->vma->vm_file),
1324 : .flags = IOMAP_WRITE | IOMAP_FAULT,
1325 : };
1326 92818169 : struct folio *folio = page_folio(vmf->page);
1327 92535240 : ssize_t ret;
1328 :
1329 92535240 : folio_lock(folio);
1330 92848854 : ret = folio_mkwrite_check_truncate(folio, iter.inode);
1331 92872019 : if (ret < 0)
1332 13598 : goto out_unlock;
1333 92858421 : iter.pos = folio_pos(folio);
1334 92858421 : iter.len = ret;
1335 188730178 : while ((ret = iomap_iter(&iter, ops)) > 0)
1336 95538337 : iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
1337 :
1338 92837532 : if (ret < 0)
1339 69313 : goto out_unlock;
1340 92768219 : folio_wait_stable(folio);
1341 92768219 : return VM_FAULT_LOCKED;
1342 82911 : out_unlock:
1343 82911 : folio_unlock(folio);
1344 82912 : return block_page_mkwrite_return(ret);
1345 : }
1346 : EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
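iomap_page_mkwrite() slots into a filesystem's vm_operations_struct as the ->page_mkwrite handler; the surrounding boilerplate is the caller's job. A sketch using the same hypothetical ops:

static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        vm_fault_t ret;

        sb_start_pagefault(inode->i_sb);
        file_update_time(vmf->vma->vm_file);
        ret = iomap_page_mkwrite(vmf, &example_iomap_ops);
        sb_end_pagefault(inode->i_sb);
        return ret;
}

static const struct vm_operations_struct example_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = example_page_mkwrite,
};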
1347 :
1348 879461754 : static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
1349 : size_t len, int error)
1350 : {
1351 879461754 : struct iomap_page *iop = to_iomap_page(folio);
1352 :
1353 879461754 : if (error) {
1354 3516771 : folio_set_error(folio);
1355 3516779 : mapping_set_error(inode->i_mapping, error);
1356 : }
1357 :
1358 1758923521 : WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
1359 894040853 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1360 :
1361 879461759 : if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1362 878416183 : folio_end_writeback(folio);
1363 879461764 : }
1364 :
1365 : /*
1366 : * We're now finished for good with this ioend structure. Update the page
1367 : * state, release holds on bios, and finally free up memory. Do not use the
1368 : * ioend after this.
1369 : */
1370 : static u32
1371 93871316 : iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1372 : {
1373 93871316 : struct inode *inode = ioend->io_inode;
1374 93871316 : struct bio *bio = &ioend->io_inline_bio;
1375 93871316 : struct bio *last = ioend->io_bio, *next;
1376 93871316 : u64 start = bio->bi_iter.bi_sector;
1377 93871316 : loff_t offset = ioend->io_offset;
1378 93871316 : bool quiet = bio_flagged(bio, BIO_QUIET);
1379 93871316 : u32 folio_count = 0;
1380 :
1381 187920996 : for (bio = &ioend->io_inline_bio; bio; bio = next) {
1382 94049679 : struct folio_iter fi;
1383 :
1384 : /*
1385 : * For the last bio, bi_private points to the ioend, so we
1386 : * need to explicitly end the iteration here.
1387 : */
1388 94049679 : if (bio == last)
1389 : next = NULL;
1390 : else
1391 178362 : next = bio->bi_private;
1392 :
1393 : /* walk all folios in bio, ending page IO on them */
1394 973511430 : bio_for_each_folio_all(fi, bio) {
1395 879461751 : iomap_finish_folio_write(inode, fi.folio, fi.length,
1396 : error);
1397 879461764 : folio_count++;
1398 : }
1399 94049679 : bio_put(bio);
1400 : }
1401 : /* The ioend has been freed by bio_put() */
1402 :
1403 93871317 : if (unlikely(error && !quiet)) {
1404 50442 : printk_ratelimited(KERN_ERR
1405 : "%s: writeback error on inode %lu, offset %lld, sector %llu",
1406 : inode->i_sb->s_id, inode->i_ino, offset, start);
1407 : }
1408 93871317 : return folio_count;
1409 : }
1410 :
1411 : /*
1412 : * Ioend completion routine for merged bios. This can only be called from task
1413 : * contexts as merged ioends can be of unbounded length. Hence we have to break up
1414 : * the writeback completions into manageable chunks to avoid long scheduler
1415 : * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
1416 : * good batch processing throughput without creating adverse scheduler latency
1417 : * conditions.
1418 : */
1419 : void
1420 82874309 : iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1421 : {
1422 82874309 : struct list_head tmp;
1423 82874309 : u32 completions;
1424 :
1425 82874309 : might_sleep();
1426 :
1427 82874309 : list_replace_init(&ioend->io_list, &tmp);
1428 82874309 : completions = iomap_finish_ioend(ioend, error);
1429 :
1430 83148363 : while (!list_empty(&tmp)) {
1431 274054 : if (completions > IOEND_BATCH_SIZE * 8) {
1432 27 : cond_resched();
1433 27 : completions = 0;
1434 : }
1435 274054 : ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1436 274054 : list_del_init(&ioend->io_list);
1437 274054 : completions += iomap_finish_ioend(ioend, error);
1438 : }
1439 82874310 : }
1440 : EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1441 :
1442 : /*
1443 : * We can merge two adjacent ioends if they have the same set of work to do.
1444 : */
1445 : static bool
1446 21531581 : iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1447 : {
1448 21531581 : if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1449 : return false;
1450 21531556 : if ((ioend->io_flags & IOMAP_F_SHARED) ^
1451 21531556 : (next->io_flags & IOMAP_F_SHARED))
1452 : return false;
1453 21291154 : if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1454 21291154 : (next->io_type == IOMAP_UNWRITTEN))
1455 : return false;
1456 19380705 : if (ioend->io_offset + ioend->io_size != next->io_offset)
1457 : return false;
1458 : /*
1459 : * Do not merge physically discontiguous ioends. The filesystem
1460 : * completion functions will have to iterate the physical
1461 : * discontiguities even if we merge the ioends at a logical level, so
1462 : * we don't gain anything by merging physical discontiguities here.
1463 : *
1464 : * We cannot use bio->bi_iter.bi_sector here as it is modified during
1465 : * submission so does not point to the start sector of the bio at
1466 : * completion.
1467 : */
1468 9463032 : if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
1469 9188978 : return false;
1470 : return true;
1471 : }
1472 :
1473 : void
1474 82874310 : iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1475 : {
1476 82874310 : struct iomap_ioend *next;
1477 :
1478 82874310 : INIT_LIST_HEAD(&ioend->io_list);
1479 :
1480 83148364 : while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1481 : io_list))) {
1482 21531581 : if (!iomap_ioend_can_merge(ioend, next))
1483 : break;
1484 274054 : list_move_tail(&next->io_list, &ioend->io_list);
1485 274054 : ioend->io_size += next->io_size;
1486 : }
1487 82874310 : }
1488 : EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
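Taken together, iomap_sort_ioends(), iomap_ioend_try_merge() and iomap_finish_ioends() are usually driven from a workqueue item that drains a per-inode list of completed ioends. A sketch, where example_ioend_ctx and its fields are invented for the illustration:

struct example_ioend_ctx {
        spinlock_t              lock;
        struct list_head        ioends;         /* appended to at bio completion */
        struct work_struct      work;
};

static void example_end_io_work(struct work_struct *work)
{
        struct example_ioend_ctx *ctx =
                container_of(work, struct example_ioend_ctx, work);
        struct iomap_ioend *ioend;
        LIST_HEAD(completed);

        spin_lock_irq(&ctx->lock);
        list_replace_init(&ctx->ioends, &completed);
        spin_unlock_irq(&ctx->lock);

        iomap_sort_ioends(&completed);
        while ((ioend = list_first_entry_or_null(&completed,
                        struct iomap_ioend, io_list))) {
                int error = blk_status_to_errno(ioend->io_bio->bi_status);

                list_del_init(&ioend->io_list);
                iomap_ioend_try_merge(ioend, &completed);
                iomap_finish_ioends(ioend, error);      /* frees the ioend */
        }
}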
1489 :
1490 : static int
1491 62548415 : iomap_ioend_compare(void *priv, const struct list_head *a,
1492 : const struct list_head *b)
1493 : {
1494 62548415 : struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1495 62548415 : struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1496 :
1497 62548415 : if (ia->io_offset < ib->io_offset)
1498 : return -1;
1499 25690477 : if (ia->io_offset > ib->io_offset)
1500 25685856 : return 1;
1501 : return 0;
1502 : }
1503 :
1504 : void
1505 61616783 : iomap_sort_ioends(struct list_head *ioend_list)
1506 : {
1507 61616783 : list_sort(NULL, ioend_list, iomap_ioend_compare);
1508 61616783 : }
1509 : EXPORT_SYMBOL_GPL(iomap_sort_ioends);
1510 :
1511 10722953 : static void iomap_writepage_end_bio(struct bio *bio)
1512 : {
1513 10722953 : struct iomap_ioend *ioend = bio->bi_private;
1514 :
1515 10722953 : iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1516 10722953 : }
1517 :
1518 : /*
1519 : * Submit the final bio for an ioend.
1520 : *
1521 : * If @error is non-zero, it means that we have a situation where some part of
1522 : * the submission process has failed after we've marked pages for writeback
1523 : * and unlocked them. In this situation, we need to fail the bio instead of
1524 : * submitting it. This typically only happens on a filesystem shutdown.
1525 : */
1526 : static int
1527 93870717 : iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1528 : int error)
1529 : {
1530 93870717 : ioend->io_bio->bi_private = ioend;
1531 93870717 : ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1532 :
1533 93870717 : if (wpc->ops->prepare_ioend)
1534 93870717 : error = wpc->ops->prepare_ioend(ioend, error);
1535 93870627 : if (error) {
1536 : /*
1537 : * If we're failing the IO now, just mark the ioend with an
1538 : * error and finish it. This will run IO completion immediately
1539 : * as there is only one reference to the ioend at this point in
1540 : * time.
1541 : */
1542 331 : ioend->io_bio->bi_status = errno_to_blk_status(error);
1543 331 : bio_endio(ioend->io_bio);
1544 331 : return error;
1545 : }
1546 :
1547 93870296 : submit_bio(ioend->io_bio);
1548 93870296 : return 0;
1549 : }
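/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * ->prepare_ioend hook runs after the default bi_end_io has been set
 * above, so a filesystem can substitute its own completion handler when
 * extra work (e.g. unwritten extent conversion) must run outside bio
 * completion context. The names below are assumptions.
 */
static void example_end_bio(struct bio *bio)
{
	struct iomap_ioend *ioend = bio->bi_private;

	/* Defer the heavyweight completion, e.g. queue_work() on @ioend. */
	(void)ioend;
}

static int example_prepare_ioend(struct iomap_ioend *ioend, int status)
{
	if (ioend->io_type == IOMAP_UNWRITTEN ||
	    (ioend->io_flags & IOMAP_F_SHARED))
		ioend->io_bio->bi_end_io = example_end_bio;
	return status;
}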
1550 :
1551 : static struct iomap_ioend *
1552 93870728 : iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1553 : loff_t offset, sector_t sector, struct writeback_control *wbc)
1554 : {
1555 93870728 : struct iomap_ioend *ioend;
1556 93870728 : struct bio *bio;
1557 :
1558 152839813 : bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
1559 : REQ_OP_WRITE | wbc_to_write_flags(wbc),
1560 : GFP_NOFS, &iomap_ioend_bioset);
1561 93870920 : bio->bi_iter.bi_sector = sector;
1562 93870920 : wbc_init_bio(wbc, bio);
1563 :
1564 93870743 : ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1565 93870743 : INIT_LIST_HEAD(&ioend->io_list);
1566 93870743 : ioend->io_type = wpc->iomap.type;
1567 93870743 : ioend->io_flags = wpc->iomap.flags;
1568 93870743 : ioend->io_inode = inode;
1569 93870743 : ioend->io_size = 0;
1570 93870743 : ioend->io_folios = 0;
1571 93870743 : ioend->io_offset = offset;
1572 93870743 : ioend->io_bio = bio;
1573 93870743 : ioend->io_sector = sector;
1574 93870743 : return ioend;
1575 : }
1576 :
1577 : /*
1578 : * Allocate a new bio, and chain the old bio to the new one.
1579 : *
1580 : * Note that we have to perform the chaining in this unintuitive order
1581 : * so that the bi_private linkage is set up in the right direction for the
1582 : * traversal in iomap_finish_ioend().
1583 : */
1584 : static struct bio *
1585 178362 : iomap_chain_bio(struct bio *prev)
1586 : {
1587 178362 : struct bio *new;
1588 :
1589 178362 : new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
1590 178361 : bio_clone_blkg_association(new, prev);
1591 178361 : new->bi_iter.bi_sector = bio_end_sector(prev);
1592 :
1593 178361 : bio_chain(prev, new);
1594 178361 : bio_get(prev); /* for iomap_finish_ioend */
1595 178362 : submit_bio(prev);
1596 178362 : return new;
1597 : }
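/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * chaining order above leaves each submitted bio's bi_private pointing
 * at the bio that follows it, so completion can walk the chain forwards
 * from the ioend's embedded bio. The final bio's bi_private is set to
 * the ioend at submission, which is why the walk must stop there
 * explicitly. Per-folio completion and error handling are elided.
 */
static void example_walk_ioend_bios(struct iomap_ioend *ioend)
{
	struct bio *bio, *next;
	struct bio *last = ioend->io_bio;	/* final bio in the chain */

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		next = (bio == last) ? NULL : bio->bi_private;
		/* ... end writeback on each folio attached to @bio ... */
		bio_put(bio);
	}
}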
1598 :
1599 : static bool
1600 877175013 : iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1601 : sector_t sector)
1602 : {
1603 877175013 : if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1604 877175013 : (wpc->ioend->io_flags & IOMAP_F_SHARED))
1605 : return false;
1606 876297954 : if (wpc->iomap.type != wpc->ioend->io_type)
1607 : return false;
1608 865923075 : if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1609 : return false;
1610 854465499 : if (sector != bio_end_sector(wpc->ioend->io_bio))
1611 : return false;
1612 : /*
1613 : * Limit ioend bio chain lengths to minimise IO completion latency. This
1614 : * also prevents long tight loops ending page writeback on all the
1615 : * folios in the ioend.
1616 : */
1617 837314038 : if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
1618 32387 : return false;
1619 : return true;
1620 : }
1621 :
1622 : /*
1623 : * Test to see if we have an existing ioend structure that we could append to
1624 : * first; otherwise finish off the current ioend and start another.
1625 : */
1626 : static void
1627 931151000 : iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
1628 : struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1629 : struct writeback_control *wbc, struct list_head *iolist)
1630 : {
1631 931151000 : sector_t sector = iomap_sector(&wpc->iomap, pos);
1632 931151000 : unsigned len = i_blocksize(inode);
1633 931149210 : size_t poff = offset_in_folio(folio, pos);
1634 :
1635 931145008 : if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
1636 93870584 : if (wpc->ioend)
1637 39893734 : list_add(&wpc->ioend->io_list, iolist);
1638 93870587 : wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
1639 : }
1640 :
1641 931145143 : if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
1642 178362 : wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
1643 178362 : bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
1644 : }
1645 :
1646 931168681 : if (iop)
1647 66312295 : atomic_add(len, &iop->write_bytes_pending);
1648 931170380 : wpc->ioend->io_size += len;
1649 931170380 : wbc_account_cgroup_owner(wbc, &folio->page, len);
1650 931146631 : }
1651 :
1652 : /*
1653 : * We implement an immediate ioend submission policy here to avoid needing to
1654 : * chain multiple ioends and hence nest mempool allocations which can violate
1655 : * the forward progress guarantees we need to provide. The current ioend we're
1656 : * adding blocks to is cached in the writepage context, and if the new block
1657 : * doesn't append to the cached ioend, it will create a new ioend and cache that
1658 : * instead.
1659 : *
1660 : * If a new ioend is created and cached, the old ioend is returned and queued
1661 : * locally for submission once the entire page is processed or an error has been
1662 : * detected. While ioends are submitted immediately after they are fully built,
1663 : * batching optimisations are provided by higher level block plugging.
1664 : *
1665 : * At the end of a writeback pass, there will be a cached ioend remaining on the
1666 : * writepage context that the caller will need to submit.
1667 : */
1668 : static int
1669 879875416 : iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1670 : struct writeback_control *wbc, struct inode *inode,
1671 : struct folio *folio, u64 end_pos)
1672 : {
1673 879875416 : struct iomap_page *iop = iomap_page_create(inode, folio, 0);
1674 879869233 : struct iomap_ioend *ioend, *next;
1675 879869233 : unsigned len = i_blocksize(inode);
1676 879855542 : unsigned nblocks = i_blocks_per_folio(inode, folio);
1677 879851666 : u64 pos = folio_pos(folio);
1678 879851666 : int error = 0, count = 0, i;
1679 879851666 : LIST_HEAD(submit_list);
1680 :
1681 893403405 : WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1682 :
1683 : /*
1684 : * Walk through the folio to find areas to write back. If we
1685 : * run off the end of the current map or find the current map
1686 : * invalid, grab a new one.
1687 : */
1688 1812868779 : for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1689 1002689543 : if (iop && !test_bit(i, iop->uptodate))
1690 1404 : continue;
1691 :
1692 934492619 : error = wpc->ops->map_blocks(wpc, inode, pos);
1693 934518849 : if (error)
1694 : break;
1695 933017941 : trace_iomap_writepage_map(inode, &wpc->iomap);
1696 933020102 : if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1697 0 : continue;
1698 933020102 : if (wpc->iomap.type == IOMAP_HOLE)
1699 1869385 : continue;
1700 931150717 : iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
1701 : &submit_list);
1702 931146324 : count++;
1703 : }
1704 879875799 : if (count)
1705 878369113 : wpc->ioend->io_folios++;
1706 :
1707 881378135 : WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1708 879875799 : WARN_ON_ONCE(!folio_test_locked(folio));
1709 879875799 : WARN_ON_ONCE(folio_test_writeback(folio));
1710 879875799 : WARN_ON_ONCE(folio_test_dirty(folio));
1711 :
1712 : /*
1713 : * We cannot cancel the ioend directly here on error. We may have
1714 : * already set other pages under writeback and hence we have to run I/O
1715 : * completion to mark the error state of the pages under writeback
1716 : * appropriately.
1717 : */
1718 879875799 : if (unlikely(error)) {
1719 : /*
1720 : * Let the filesystem know what portion of the current page
1721 : * failed to map. If the page hasn't been added to ioend, it
1722 : * won't be affected by I/O completion and we must unlock it
1723 : * now.
1724 : */
1725 1500908 : if (wpc->ops->discard_folio)
1726 1500908 : wpc->ops->discard_folio(folio, pos);
1727 1500908 : if (!count) {
1728 1500905 : folio_unlock(folio);
1729 1500905 : goto done;
1730 : }
1731 : }
1732 :
1733 878374894 : folio_start_writeback(folio);
1734 878397936 : folio_unlock(folio);
1735 :
1736 : /*
1737 : * Preserve the original error if there was one; catch
1738 : * submission errors here and propagate into subsequent ioend
1739 : * submissions.
1740 : */
1741 918300326 : list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1742 39893016 : int error2;
1743 :
1744 39893016 : list_del_init(&ioend->io_list);
1745 39893650 : error2 = iomap_submit_ioend(wpc, ioend, error);
1746 39893790 : if (error2 && !error)
1747 0 : error = error2;
1748 : }
1749 :
1750 : /*
1751 : * We can end up here with no error and nothing to write only if we race
1752 : * with a partial page truncate on a sub-page block sized filesystem.
1753 : */
1754 878407310 : if (!count)
1755 5781 : folio_end_writeback(folio);
1756 878401529 : done:
1757 879908215 : mapping_set_error(inode->i_mapping, error);
1758 879906715 : return error;
1759 : }
1760 :
1761 : /*
1762 : * Write out a dirty page.
1763 : *
1764 : * For delalloc space on the page, we need to allocate space and flush it.
1765 : * For unwritten space on the page, we need to start the conversion to
1766 : * regular allocated space.
1767 : */
1768 879897507 : static int iomap_do_writepage(struct folio *folio,
1769 : struct writeback_control *wbc, void *data)
1770 : {
1771 879897507 : struct iomap_writepage_ctx *wpc = data;
1772 879897507 : struct inode *inode = folio->mapping->host;
1773 879897507 : u64 end_pos, isize;
1774 :
1775 879897507 : trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
1776 :
1777 : /*
1778 : * Refuse to write the folio out if we're called from reclaim context.
1779 : *
1780 : * This avoids stack overflows when called from deeply used stacks in
1781 : * random callers for direct reclaim or memcg reclaim. We explicitly
1782 : * allow reclaim from kswapd as the stack usage there is relatively low.
1783 : *
1784 : * This should never happen except in the case of a VM regression so
1785 : * warn about it.
1786 : */
1787 879886660 : if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1788 : PF_MEMALLOC))
1789 0 : goto redirty;
1790 :
1791 : /*
1792 : * Is this folio beyond the end of the file?
1793 : *
1794 : * If the folio index is less than end_index, adjust end_pos to
1795 : * the highest offset that this folio should represent.
1796 : * -----------------------------------------------------
1797 : * | file mapping | <EOF> |
1798 : * -----------------------------------------------------
1799 : * | Page ... | Page N-2 | Page N-1 | Page N | |
1800 : * ^--------------------------------^----------|--------
1801 : * | desired writeback range | see else |
1802 : * ---------------------------------^------------------|
1803 : */
1804 879886660 : isize = i_size_read(inode);
1805 879886660 : end_pos = folio_pos(folio) + folio_size(folio);
1806 879887161 : if (end_pos > isize) {
1807 : /*
1808 : * Check whether the page to write out is beyond or straddles
1809 : * i_size or not.
1810 : * -------------------------------------------------------
1811 : * | file mapping | <EOF> |
1812 : * -------------------------------------------------------
1813 : * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
1814 : * ^--------------------------------^-----------|---------
1815 : * | | Straddles |
1816 : * ---------------------------------^-----------|--------|
1817 : */
1818 20180138 : size_t poff = offset_in_folio(folio, isize);
1819 20180130 : pgoff_t end_index = isize >> PAGE_SHIFT;
1820 :
1821 : /*
1822 : * Skip the page if it's fully outside i_size, e.g.
1823 : * due to a truncate operation that's in progress. We've
1824 : * cleaned this page and truncate will finish things off for
1825 : * us.
1826 : *
1827 : * Note that end_index is an unsigned long. If the given offset
1828 : * is greater than 16TB on a 32-bit system, then checking whether
1829 : * the page is fully outside i_size with
1830 : * "if (page->index >= end_index + 1)" would go wrong:
1831 : * "end_index + 1" overflows to 0, so the page would be
1832 : * redirtied and written out repeatedly, which would result in
1833 : * an infinite loop; the user program performing this operation
1834 : * would hang. Instead, we detect this situation by checking
1835 : * whether the page is totally beyond i_size, or whether i_size
1836 : * falls exactly at the start of the page.
1837 : */
1838 20180130 : if (folio->index > end_index ||
1839 20150066 : (folio->index == end_index && poff == 0))
1840 13933 : goto unlock;
1841 :
1842 : /*
1843 : * The page straddles i_size. It must be zeroed out on each
1844 : * and every writepage invocation because it may be mmapped.
1845 : * "A file is mapped in multiples of the page size. For a file
1846 : * that is not a multiple of the page size, the remaining
1847 : * memory is zeroed when mapped, and writes to that region are
1848 : * not written out to the file."
1849 : */
1850 20166197 : folio_zero_segment(folio, poff, folio_size(folio));
1851 20166214 : end_pos = isize;
1852 : }
1853 :
1854 879873252 : return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
1855 :
1856 : redirty:
1857 0 : folio_redirty_for_writepage(wbc, folio);
1858 13933 : unlock:
1859 13933 : folio_unlock(folio);
1860 13933 : return 0;
1861 : }
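/*
 * Editor's worked example of the end_index overflow described in the
 * comment inside iomap_do_writepage() above, assuming 4KiB pages and a
 * 32-bit unsigned long: an i_size just below 16TiB gives
 * end_index = i_size >> PAGE_SHIFT = 0xFFFFFFFF, so "end_index + 1"
 * wraps to 0 and the unsigned comparison "index >= end_index + 1" is
 * true for every folio. Comparing with "folio->index > end_index", as
 * the code does, avoids the wraparound entirely.
 */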
1862 :
1863 : int
1864 98171391 : iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1865 : struct iomap_writepage_ctx *wpc,
1866 : const struct iomap_writeback_ops *ops)
1867 : {
1868 98171391 : int ret;
1869 :
1870 98171391 : wpc->ops = ops;
1871 98171391 : ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1872 98169683 : if (!wpc->ioend)
1873 : return ret;
1874 53977205 : return iomap_submit_ioend(wpc, wpc->ioend, ret);
1875 : }
1876 : EXPORT_SYMBOL_GPL(iomap_writepages);
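/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * minimal wiring a filesystem needs to drive iomap_writepages() from
 * its ->writepages method. The ops, context and mapping helper named
 * here are assumptions; a real filesystem typically embeds the
 * iomap_writepage_ctx in a larger writeback context and fills
 * wpc->iomap from its extent map in ->map_blocks.
 */
static int example_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset)
{
	/* Look up or allocate the mapping covering @offset into wpc->iomap. */
	return -EIO;		/* placeholder for the real lookup */
}

static const struct iomap_writeback_ops example_writeback_ops = {
	.map_blocks	= example_map_blocks,
};

static int example_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &example_writeback_ops);
}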
1877 :
1878 0 : static int __init iomap_init(void)
1879 : {
1880 0 : return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1881 : offsetof(struct iomap_ioend, io_inline_bio),
1882 : BIOSET_NEED_BVECS);
1883 : }
1884 : fs_initcall(iomap_init);