Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 : #include <linux/fsverity.h>
52 :
53 : #include "internal.h"
54 :
55 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
56 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
57 : struct writeback_control *wbc);
58 :
59 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
60 :
61 120691381 : inline void touch_buffer(struct buffer_head *bh)
62 : {
63 120691381 : trace_block_touch_buffer(bh);
64 120664017 : folio_mark_accessed(bh->b_folio);
65 120691052 : }
66 : EXPORT_SYMBOL(touch_buffer);
67 :
68 75033 : void __lock_buffer(struct buffer_head *bh)
69 : {
70 79606 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
71 4573 : }
72 : EXPORT_SYMBOL(__lock_buffer);
73 :
74 68559237 : void unlock_buffer(struct buffer_head *bh)
75 : {
76 68559237 : clear_bit_unlock(BH_Lock, &bh->b_state);
77 68565468 : smp_mb__after_atomic();
78 68565468 : wake_up_bit(&bh->b_state, BH_Lock);
79 68563294 : }
80 : EXPORT_SYMBOL(unlock_buffer);
81 :
82 : /*
83 : * Returns if the folio has dirty or writeback buffers. If all the buffers
84 : * are unlocked and clean then the folio_test_dirty information is stale. If
85 : * any of the buffers are locked, it is assumed they are locked for IO.
86 : */
87 22 : void buffer_check_dirty_writeback(struct folio *folio,
88 : bool *dirty, bool *writeback)
89 : {
90 22 : struct buffer_head *head, *bh;
91 22 : *dirty = false;
92 22 : *writeback = false;
93 :
94 22 : BUG_ON(!folio_test_locked(folio));
95 :
96 22 : head = folio_buffers(folio);
97 22 : if (!head)
98 : return;
99 :
100 22 : if (folio_test_writeback(folio))
101 0 : *writeback = true;
102 :
103 : bh = head;
104 22 : do {
105 44 : if (buffer_locked(bh))
106 0 : *writeback = true;
107 :
108 44 : if (buffer_dirty(bh))
109 0 : *dirty = true;
110 :
111 22 : bh = bh->b_this_page;
112 22 : } while (bh != head);
113 : }
114 :
115 : /*
116 : * Block until a buffer comes unlocked. This doesn't stop it
117 : * from becoming locked again - you have to lock it yourself
118 : * if you want to preserve its state.
119 : */
120 574782 : void __wait_on_buffer(struct buffer_head * bh)
121 : {
122 2524604 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
123 1949821 : }
124 : EXPORT_SYMBOL(__wait_on_buffer);
125 :
126 275337 : static void buffer_io_error(struct buffer_head *bh, char *msg)
127 : {
128 275337 : if (!test_bit(BH_Quiet, &bh->b_state))
129 275337 : printk_ratelimited(KERN_ERR
130 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
131 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
132 275337 : }
133 :
134 : /*
135 : * End-of-IO handler helper function which does not touch the bh after
136 : * unlocking it.
137 : * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
138 : * a race there is benign: unlock_buffer() only use the bh's address for
139 : * hashing after unlocking the buffer, so it doesn't actually touch the bh
140 : * itself.
141 : */
142 2183536 : static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
143 : {
144 2183536 : if (uptodate) {
145 2183532 : set_buffer_uptodate(bh);
146 : } else {
147 : /* This happens, due to failed read-ahead attempts. */
148 4 : clear_buffer_uptodate(bh);
149 : }
150 2183536 : unlock_buffer(bh);
151 2183536 : }
152 :
153 : /*
154 : * Default synchronous end-of-IO handler.. Just mark it up-to-date and
155 : * unlock the buffer.
156 : */
157 2183536 : void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
158 : {
159 2183536 : __end_buffer_read_notouch(bh, uptodate);
160 2183536 : put_bh(bh);
161 2183536 : }
162 : EXPORT_SYMBOL(end_buffer_read_sync);
163 :
164 182931 : void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
165 : {
166 182931 : if (uptodate) {
167 182915 : set_buffer_uptodate(bh);
168 : } else {
169 16 : buffer_io_error(bh, ", lost sync page write");
170 16 : mark_buffer_write_io_error(bh);
171 16 : clear_buffer_uptodate(bh);
172 : }
173 182931 : unlock_buffer(bh);
174 182931 : put_bh(bh);
175 182931 : }
176 : EXPORT_SYMBOL(end_buffer_write_sync);
177 :
178 : /*
179 : * Various filesystems appear to want __find_get_block to be non-blocking.
180 : * But it's the page lock which protects the buffers. To get around this,
181 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
182 : * private_lock.
183 : *
184 : * Hack idea: for the blockdev mapping, private_lock contention
185 : * may be quite high. This code could TryLock the page, and if that
186 : * succeeds, there is no need to take private_lock.
187 : */
188 : static struct buffer_head *
189 13485698 : __find_get_block_slow(struct block_device *bdev, sector_t block)
190 : {
191 13485698 : struct inode *bd_inode = bdev->bd_inode;
192 13485698 : struct address_space *bd_mapping = bd_inode->i_mapping;
193 13485698 : struct buffer_head *ret = NULL;
194 13485698 : pgoff_t index;
195 13485698 : struct buffer_head *bh;
196 13485698 : struct buffer_head *head;
197 13485698 : struct folio *folio;
198 13485698 : int all_mapped = 1;
199 13485698 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
200 :
201 13485698 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
202 13485698 : folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
203 13485638 : if (IS_ERR(folio))
204 2098713 : goto out;
205 :
206 11386925 : spin_lock(&bd_mapping->private_lock);
207 11390799 : head = folio_buffers(folio);
208 11390799 : if (!head)
209 2645 : goto out_unlock;
210 : bh = head;
211 13155358 : do {
212 26310716 : if (!buffer_mapped(bh))
213 : all_mapped = 0;
214 13155360 : else if (bh->b_blocknr == block) {
215 11388154 : ret = bh;
216 11388154 : get_bh(bh);
217 11388159 : goto out_unlock;
218 : }
219 1767204 : bh = bh->b_this_page;
220 1767204 : } while (bh != head);
221 :
222 : /* we might be here because some of the buffers on this page are
223 : * not mapped. This is due to various races between
224 : * file io on the block device and getblk. It gets dealt with
225 : * elsewhere, don't buffer_error if we had some unmapped buffers
226 : */
227 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
228 0 : if (all_mapped && __ratelimit(&last_warned)) {
229 0 : printk("__find_get_block_slow() failed. block=%llu, "
230 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
231 : "device %pg blocksize: %d\n",
232 : (unsigned long long)block,
233 : (unsigned long long)bh->b_blocknr,
234 : bh->b_state, bh->b_size, bdev,
235 : 1 << bd_inode->i_blkbits);
236 : }
237 0 : out_unlock:
238 11390804 : spin_unlock(&bd_mapping->private_lock);
239 11390603 : folio_put(folio);
240 13489328 : out:
241 13489328 : return ret;
242 : }
243 :
244 42466 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
245 : {
246 42466 : unsigned long flags;
247 42466 : struct buffer_head *first;
248 42466 : struct buffer_head *tmp;
249 42466 : struct folio *folio;
250 42466 : int folio_uptodate = 1;
251 :
252 84932 : BUG_ON(!buffer_async_read(bh));
253 :
254 42466 : folio = bh->b_folio;
255 42466 : if (uptodate) {
256 36197 : set_buffer_uptodate(bh);
257 : } else {
258 6269 : clear_buffer_uptodate(bh);
259 6269 : buffer_io_error(bh, ", async page read");
260 6269 : folio_set_error(folio);
261 : }
262 :
263 : /*
264 : * Be _very_ careful from here on. Bad things can happen if
265 : * two buffer heads end IO at almost the same time and both
266 : * decide that the page is now completely done.
267 : */
268 42466 : first = folio_buffers(folio);
269 42466 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
270 42466 : clear_buffer_async_read(bh);
271 42466 : unlock_buffer(bh);
272 42466 : tmp = bh;
273 44940 : do {
274 89880 : if (!buffer_uptodate(tmp))
275 7774 : folio_uptodate = 0;
276 89880 : if (buffer_async_read(tmp)) {
277 1946 : BUG_ON(!buffer_locked(tmp));
278 973 : goto still_busy;
279 : }
280 43967 : tmp = tmp->b_this_page;
281 43967 : } while (tmp != bh);
282 41493 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
283 :
284 : /*
285 : * If all of the buffers are uptodate then we can set the page
286 : * uptodate.
287 : */
288 41493 : if (folio_uptodate)
289 35735 : folio_mark_uptodate(folio);
290 41493 : folio_unlock(folio);
291 41493 : return;
292 :
293 : still_busy:
294 973 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
295 : return;
296 : }
297 :
298 : struct postprocess_bh_ctx {
299 : struct work_struct work;
300 : struct buffer_head *bh;
301 : };
302 :
303 : static void verify_bh(struct work_struct *work)
304 : {
305 : struct postprocess_bh_ctx *ctx =
306 : container_of(work, struct postprocess_bh_ctx, work);
307 : struct buffer_head *bh = ctx->bh;
308 : bool valid;
309 :
310 : valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
311 : end_buffer_async_read(bh, valid);
312 : kfree(ctx);
313 : }
314 :
315 : static bool need_fsverity(struct buffer_head *bh)
316 : {
317 : struct folio *folio = bh->b_folio;
318 : struct inode *inode = folio->mapping->host;
319 :
320 : return fsverity_active(inode) &&
321 : /* needed by ext4 */
322 : folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
323 : }
324 :
325 : static void decrypt_bh(struct work_struct *work)
326 : {
327 : struct postprocess_bh_ctx *ctx =
328 : container_of(work, struct postprocess_bh_ctx, work);
329 : struct buffer_head *bh = ctx->bh;
330 : int err;
331 :
332 : err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
333 : bh_offset(bh));
334 : if (err == 0 && need_fsverity(bh)) {
335 : /*
336 : * We use different work queues for decryption and for verity
337 : * because verity may require reading metadata pages that need
338 : * decryption, and we shouldn't recurse to the same workqueue.
339 : */
340 : INIT_WORK(&ctx->work, verify_bh);
341 : fsverity_enqueue_verify_work(&ctx->work);
342 : return;
343 : }
344 : end_buffer_async_read(bh, err == 0);
345 : kfree(ctx);
346 : }
347 :
348 : /*
349 : * I/O completion handler for block_read_full_folio() - pages
350 : * which come unlocked at the end of I/O.
351 : */
352 42430 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
353 : {
354 42430 : struct inode *inode = bh->b_folio->mapping->host;
355 42430 : bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
356 42430 : bool verify = need_fsverity(bh);
357 :
358 : /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
359 42430 : if (uptodate && (decrypt || verify)) {
360 : struct postprocess_bh_ctx *ctx =
361 : kmalloc(sizeof(*ctx), GFP_ATOMIC);
362 :
363 : if (ctx) {
364 : ctx->bh = bh;
365 : if (decrypt) {
366 : INIT_WORK(&ctx->work, decrypt_bh);
367 : fscrypt_enqueue_decrypt_work(&ctx->work);
368 : } else {
369 : INIT_WORK(&ctx->work, verify_bh);
370 : fsverity_enqueue_verify_work(&ctx->work);
371 : }
372 : return;
373 : }
374 : uptodate = 0;
375 : }
376 42430 : end_buffer_async_read(bh, uptodate);
377 : }
378 :
379 : /*
380 : * Completion handler for block_write_full_page() - pages which are unlocked
381 : * during I/O, and which have PageWriteback cleared upon I/O completion.
382 : */
383 11982691 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
384 : {
385 11982691 : unsigned long flags;
386 11982691 : struct buffer_head *first;
387 11982691 : struct buffer_head *tmp;
388 11982691 : struct folio *folio;
389 :
390 23965382 : BUG_ON(!buffer_async_write(bh));
391 :
392 11982691 : folio = bh->b_folio;
393 11982691 : if (uptodate) {
394 11713639 : set_buffer_uptodate(bh);
395 : } else {
396 269052 : buffer_io_error(bh, ", lost async page write");
397 269052 : mark_buffer_write_io_error(bh);
398 269052 : clear_buffer_uptodate(bh);
399 269052 : folio_set_error(folio);
400 : }
401 :
402 11982691 : first = folio_buffers(folio);
403 11982691 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
404 :
405 11982691 : clear_buffer_async_write(bh);
406 11982690 : unlock_buffer(bh);
407 11982691 : tmp = bh->b_this_page;
408 12157778 : while (tmp != bh) {
409 685472 : if (buffer_async_write(tmp)) {
410 335298 : BUG_ON(!buffer_locked(tmp));
411 167649 : goto still_busy;
412 : }
413 175087 : tmp = tmp->b_this_page;
414 : }
415 11815042 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
416 11815042 : folio_end_writeback(folio);
417 11815042 : return;
418 :
419 : still_busy:
420 167649 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
421 : return;
422 : }
423 : EXPORT_SYMBOL(end_buffer_async_write);
424 :
425 : /*
426 : * If a page's buffers are under async readin (end_buffer_async_read
427 : * completion) then there is a possibility that another thread of
428 : * control could lock one of the buffers after it has completed
429 : * but while some of the other buffers have not completed. This
430 : * locked buffer would confuse end_buffer_async_read() into not unlocking
431 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
432 : * that this buffer is not under async I/O.
433 : *
434 : * The page comes unlocked when it has no locked buffer_async buffers
435 : * left.
436 : *
437 : * PageLocked prevents anyone starting new async I/O reads any of
438 : * the buffers.
439 : *
440 : * PageWriteback is used to prevent simultaneous writeout of the same
441 : * page.
442 : *
443 : * PageLocked prevents anyone from starting writeback of a page which is
444 : * under read I/O (PageWriteback is only ever set against a locked page).
445 : */
446 42466 : static void mark_buffer_async_read(struct buffer_head *bh)
447 : {
448 42466 : bh->b_end_io = end_buffer_async_read_io;
449 42466 : set_buffer_async_read(bh);
450 42466 : }
451 :
452 11982690 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
453 : bh_end_io_t *handler)
454 : {
455 11982690 : bh->b_end_io = handler;
456 11982690 : set_buffer_async_write(bh);
457 11982690 : }
458 :
459 0 : void mark_buffer_async_write(struct buffer_head *bh)
460 : {
461 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
462 0 : }
463 : EXPORT_SYMBOL(mark_buffer_async_write);
464 :
465 :
466 : /*
467 : * fs/buffer.c contains helper functions for buffer-backed address space's
468 : * fsync functions. A common requirement for buffer-based filesystems is
469 : * that certain data from the backing blockdev needs to be written out for
470 : * a successful fsync(). For example, ext2 indirect blocks need to be
471 : * written back and waited upon before fsync() returns.
472 : *
473 : * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
474 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
475 : * management of a list of dependent buffers at ->i_mapping->private_list.
476 : *
477 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
478 : * from their controlling inode's queue when they are being freed. But
479 : * try_to_free_buffers() will be operating against the *blockdev* mapping
480 : * at the time, not against the S_ISREG file which depends on those buffers.
481 : * So the locking for private_list is via the private_lock in the address_space
482 : * which backs the buffers. Which is different from the address_space
483 : * against which the buffers are listed. So for a particular address_space,
484 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
485 : * mapping->private_list will always be protected by the backing blockdev's
486 : * ->private_lock.
487 : *
488 : * Which introduces a requirement: all buffers on an address_space's
489 : * ->private_list must be from the same address_space: the blockdev's.
490 : *
491 : * address_spaces which do not place buffers at ->private_list via these
492 : * utility functions are free to use private_lock and private_list for
493 : * whatever they want. The only requirement is that list_empty(private_list)
494 : * be true at clear_inode() time.
495 : *
496 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
497 : * filesystems should do that. invalidate_inode_buffers() should just go
498 : * BUG_ON(!list_empty).
499 : *
500 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
501 : * take an address_space, not an inode. And it should be called
502 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
503 : * queued up.
504 : *
505 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
506 : * list if it is already on a list. Because if the buffer is on a list,
507 : * it *must* already be on the right one. If not, the filesystem is being
508 : * silly. This will save a ton of locking. But first we have to ensure
509 : * that buffers are taken *off* the old inode's list when they are freed
510 : * (presumably in truncate). That requires careful auditing of all
511 : * filesystems (do it inside bforget()). It could also be done by bringing
512 : * b_inode back.
513 : */
514 :
515 : /*
516 : * The buffer's backing address_space's private_lock must be held
517 : */
518 118035 : static void __remove_assoc_queue(struct buffer_head *bh)
519 : {
520 118035 : list_del_init(&bh->b_assoc_buffers);
521 118035 : WARN_ON(!bh->b_assoc_map);
522 118035 : bh->b_assoc_map = NULL;
523 118035 : }
524 :
525 1653525336 : int inode_has_buffers(struct inode *inode)
526 : {
527 1656716793 : return !list_empty(&inode->i_data.private_list);
528 : }
529 :
530 : /*
531 : * osync is designed to support O_SYNC io. It waits synchronously for
532 : * all already-submitted IO to complete, but does not queue any new
533 : * writes to the disk.
534 : *
535 : * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
536 : * as you dirty the buffers, and then use osync_inode_buffers to wait for
537 : * completion. Any other dirty buffers which are not yet queued for
538 : * write will not be flushed to disk by the osync.
539 : */
540 339 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
541 : {
542 339 : struct buffer_head *bh;
543 339 : struct list_head *p;
544 339 : int err = 0;
545 :
546 339 : spin_lock(lock);
547 339 : repeat:
548 339 : list_for_each_prev(p, list) {
549 0 : bh = BH_ENTRY(p);
550 0 : if (buffer_locked(bh)) {
551 0 : get_bh(bh);
552 0 : spin_unlock(lock);
553 0 : wait_on_buffer(bh);
554 0 : if (!buffer_uptodate(bh))
555 0 : err = -EIO;
556 0 : brelse(bh);
557 0 : spin_lock(lock);
558 0 : goto repeat;
559 : }
560 : }
561 339 : spin_unlock(lock);
562 339 : return err;
563 : }
564 :
565 0 : void emergency_thaw_bdev(struct super_block *sb)
566 : {
567 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
568 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
569 0 : }
570 :
571 : /**
572 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
573 : * @mapping: the mapping which wants those buffers written
574 : *
575 : * Starts I/O against the buffers at mapping->private_list, and waits upon
576 : * that I/O.
577 : *
578 : * Basically, this is a convenience function for fsync().
579 : * @mapping is a file or directory which needs those buffers to be written for
580 : * a successful fsync().
581 : */
582 856 : int sync_mapping_buffers(struct address_space *mapping)
583 : {
584 856 : struct address_space *buffer_mapping = mapping->private_data;
585 :
586 856 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
587 : return 0;
588 :
589 339 : return fsync_buffers_list(&buffer_mapping->private_lock,
590 : &mapping->private_list);
591 : }
592 : EXPORT_SYMBOL(sync_mapping_buffers);
593 :
594 : /**
595 : * generic_buffers_fsync_noflush - generic buffer fsync implementation
596 : * for simple filesystems with no inode lock
597 : *
598 : * @file: file to synchronize
599 : * @start: start offset in bytes
600 : * @end: end offset in bytes (inclusive)
601 : * @datasync: only synchronize essential metadata if true
602 : *
603 : * This is a generic implementation of the fsync method for simple
604 : * filesystems which track all non-inode metadata in the buffers list
605 : * hanging off the address_space structure.
606 : */
607 855 : int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
608 : bool datasync)
609 : {
610 855 : struct inode *inode = file->f_mapping->host;
611 855 : int err;
612 855 : int ret;
613 :
614 855 : err = file_write_and_wait_range(file, start, end);
615 855 : if (err)
616 : return err;
617 :
618 855 : ret = sync_mapping_buffers(inode->i_mapping);
619 855 : if (!(inode->i_state & I_DIRTY_ALL))
620 537 : goto out;
621 318 : if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
622 1 : goto out;
623 :
624 317 : err = sync_inode_metadata(inode, 1);
625 317 : if (ret == 0)
626 317 : ret = err;
627 :
628 0 : out:
629 : /* check and advance again to catch errors after syncing out buffers */
630 855 : err = file_check_and_advance_wb_err(file);
631 855 : if (ret == 0)
632 855 : ret = err;
633 : return ret;
634 : }
635 : EXPORT_SYMBOL(generic_buffers_fsync_noflush);
636 :
637 : /**
638 : * generic_buffers_fsync - generic buffer fsync implementation
639 : * for simple filesystems with no inode lock
640 : *
641 : * @file: file to synchronize
642 : * @start: start offset in bytes
643 : * @end: end offset in bytes (inclusive)
644 : * @datasync: only synchronize essential metadata if true
645 : *
646 : * This is a generic implementation of the fsync method for simple
647 : * filesystems which track all non-inode metadata in the buffers list
648 : * hanging off the address_space structure. This also makes sure that
649 : * a device cache flush operation is called at the end.
650 : */
651 834 : int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
652 : bool datasync)
653 : {
654 834 : struct inode *inode = file->f_mapping->host;
655 834 : int ret;
656 :
657 834 : ret = generic_buffers_fsync_noflush(file, start, end, datasync);
658 834 : if (!ret)
659 834 : ret = blkdev_issue_flush(inode->i_sb->s_bdev);
660 834 : return ret;
661 : }
662 : EXPORT_SYMBOL(generic_buffers_fsync);
663 :
664 : /*
665 : * Called when we've recently written block `bblock', and it is known that
666 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
667 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
668 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
669 : */
670 192 : void write_boundary_block(struct block_device *bdev,
671 : sector_t bblock, unsigned blocksize)
672 : {
673 192 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
674 192 : if (bh) {
675 268 : if (buffer_dirty(bh))
676 60 : write_dirty_buffer(bh, 0);
677 134 : put_bh(bh);
678 : }
679 192 : }
680 :
681 755554 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
682 : {
683 755554 : struct address_space *mapping = inode->i_mapping;
684 755554 : struct address_space *buffer_mapping = bh->b_folio->mapping;
685 :
686 755554 : mark_buffer_dirty(bh);
687 755554 : if (!mapping->private_data) {
688 100176 : mapping->private_data = buffer_mapping;
689 : } else {
690 655378 : BUG_ON(mapping->private_data != buffer_mapping);
691 : }
692 755554 : if (!bh->b_assoc_map) {
693 117651 : spin_lock(&buffer_mapping->private_lock);
694 117651 : list_move_tail(&bh->b_assoc_buffers,
695 : &mapping->private_list);
696 117651 : bh->b_assoc_map = mapping;
697 117651 : spin_unlock(&buffer_mapping->private_lock);
698 : }
699 755554 : }
700 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
701 :
702 : /*
703 : * Add a page to the dirty page list.
704 : *
705 : * It is a sad fact of life that this function is called from several places
706 : * deeply under spinlocking. It may not sleep.
707 : *
708 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
709 : * dirty-state coherency between the page and the buffers. It the page does
710 : * not have buffers then when they are later attached they will all be set
711 : * dirty.
712 : *
713 : * The buffers are dirtied before the page is dirtied. There's a small race
714 : * window in which a writepage caller may see the page cleanness but not the
715 : * buffer dirtiness. That's fine. If this code were to set the page dirty
716 : * before the buffers, a concurrent writepage caller could clear the page dirty
717 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
718 : * page on the dirty page list.
719 : *
720 : * We use private_lock to lock against try_to_free_buffers while using the
721 : * page's buffer list. Also use this to protect against clean buffers being
722 : * added to the page after it was set dirty.
723 : *
724 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
725 : * address_space though.
726 : */
727 24014721 : bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
728 : {
729 24014721 : struct buffer_head *head;
730 24014721 : bool newly_dirty;
731 :
732 24014721 : spin_lock(&mapping->private_lock);
733 24064477 : head = folio_buffers(folio);
734 24064477 : if (head) {
735 : struct buffer_head *bh = head;
736 :
737 24036526 : do {
738 24036526 : set_buffer_dirty(bh);
739 24011401 : bh = bh->b_this_page;
740 24011401 : } while (bh != head);
741 : }
742 : /*
743 : * Lock out page's memcg migration to keep PageDirty
744 : * synchronized with per-memcg dirty page counters.
745 : */
746 24039352 : folio_memcg_lock(folio);
747 24012699 : newly_dirty = !folio_test_set_dirty(folio);
748 24069604 : spin_unlock(&mapping->private_lock);
749 :
750 24044646 : if (newly_dirty)
751 2236 : __folio_mark_dirty(folio, mapping, 1);
752 :
753 24044646 : folio_memcg_unlock(folio);
754 :
755 24032876 : if (newly_dirty)
756 2236 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
757 :
758 24032876 : return newly_dirty;
759 : }
760 : EXPORT_SYMBOL(block_dirty_folio);
761 :
762 : /*
763 : * Write out and wait upon a list of buffers.
764 : *
765 : * We have conflicting pressures: we want to make sure that all
766 : * initially dirty buffers get waited on, but that any subsequently
767 : * dirtied buffers don't. After all, we don't want fsync to last
768 : * forever if somebody is actively writing to the file.
769 : *
770 : * Do this in two main stages: first we copy dirty buffers to a
771 : * temporary inode list, queueing the writes as we go. Then we clean
772 : * up, waiting for those writes to complete.
773 : *
774 : * During this second stage, any subsequent updates to the file may end
775 : * up refiling the buffer on the original inode's dirty list again, so
776 : * there is a chance we will end up with a buffer queued for write but
777 : * not yet completed on that list. So, as a final cleanup we go through
778 : * the osync code to catch these locked, dirty buffers without requeuing
779 : * any newly dirty buffers for write.
780 : */
781 339 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
782 : {
783 339 : struct buffer_head *bh;
784 339 : struct list_head tmp;
785 339 : struct address_space *mapping;
786 339 : int err = 0, err2;
787 339 : struct blk_plug plug;
788 :
789 339 : INIT_LIST_HEAD(&tmp);
790 339 : blk_start_plug(&plug);
791 :
792 339 : spin_lock(lock);
793 1116 : while (!list_empty(list)) {
794 777 : bh = BH_ENTRY(list->next);
795 777 : mapping = bh->b_assoc_map;
796 777 : __remove_assoc_queue(bh);
797 : /* Avoid race with mark_buffer_dirty_inode() which does
798 : * a lockless check and we rely on seeing the dirty bit */
799 777 : smp_mb();
800 1795 : if (buffer_dirty(bh) || buffer_locked(bh)) {
801 547 : list_add(&bh->b_assoc_buffers, &tmp);
802 547 : bh->b_assoc_map = mapping;
803 1094 : if (buffer_dirty(bh)) {
804 536 : get_bh(bh);
805 536 : spin_unlock(lock);
806 : /*
807 : * Ensure any pending I/O completes so that
808 : * write_dirty_buffer() actually writes the
809 : * current contents - it is a noop if I/O is
810 : * still in flight on potentially older
811 : * contents.
812 : */
813 536 : write_dirty_buffer(bh, REQ_SYNC);
814 :
815 : /*
816 : * Kick off IO for the previous mapping. Note
817 : * that we will not run the very last mapping,
818 : * wait_on_buffer() will do that for us
819 : * through sync_buffer().
820 : */
821 536 : brelse(bh);
822 536 : spin_lock(lock);
823 : }
824 : }
825 : }
826 :
827 339 : spin_unlock(lock);
828 339 : blk_finish_plug(&plug);
829 339 : spin_lock(lock);
830 :
831 886 : while (!list_empty(&tmp)) {
832 547 : bh = BH_ENTRY(tmp.prev);
833 547 : get_bh(bh);
834 547 : mapping = bh->b_assoc_map;
835 547 : __remove_assoc_queue(bh);
836 : /* Avoid race with mark_buffer_dirty_inode() which does
837 : * a lockless check and we rely on seeing the dirty bit */
838 547 : smp_mb();
839 1094 : if (buffer_dirty(bh)) {
840 0 : list_add(&bh->b_assoc_buffers,
841 : &mapping->private_list);
842 0 : bh->b_assoc_map = mapping;
843 : }
844 547 : spin_unlock(lock);
845 547 : wait_on_buffer(bh);
846 1094 : if (!buffer_uptodate(bh))
847 0 : err = -EIO;
848 547 : brelse(bh);
849 547 : spin_lock(lock);
850 : }
851 :
852 339 : spin_unlock(lock);
853 339 : err2 = osync_buffers_list(lock, list);
854 339 : if (err)
855 : return err;
856 : else
857 339 : return err2;
858 : }
859 :
860 : /*
861 : * Invalidate any and all dirty buffers on a given inode. We are
862 : * probably unmounting the fs, but that doesn't mean we have already
863 : * done a sync(). Just drop the buffers from the inode list.
864 : *
865 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
866 : * assumes that all the buffers are against the blockdev. Not true
867 : * for reiserfs.
868 : */
869 3163545 : void invalidate_inode_buffers(struct inode *inode)
870 : {
871 3163545 : if (inode_has_buffers(inode)) {
872 100146 : struct address_space *mapping = &inode->i_data;
873 100146 : struct list_head *list = &mapping->private_list;
874 100146 : struct address_space *buffer_mapping = mapping->private_data;
875 :
876 100146 : spin_lock(&buffer_mapping->private_lock);
877 216857 : while (!list_empty(list))
878 116711 : __remove_assoc_queue(BH_ENTRY(list->next));
879 100146 : spin_unlock(&buffer_mapping->private_lock);
880 : }
881 3163545 : }
882 : EXPORT_SYMBOL(invalidate_inode_buffers);
883 :
884 : /*
885 : * Remove any clean buffers from the inode's buffer list. This is called
886 : * when we're trying to free the inode itself. Those buffers can pin it.
887 : *
888 : * Returns true if all buffers were removed.
889 : */
890 27912 : int remove_inode_buffers(struct inode *inode)
891 : {
892 27912 : int ret = 1;
893 :
894 27912 : if (inode_has_buffers(inode)) {
895 0 : struct address_space *mapping = &inode->i_data;
896 0 : struct list_head *list = &mapping->private_list;
897 0 : struct address_space *buffer_mapping = mapping->private_data;
898 :
899 0 : spin_lock(&buffer_mapping->private_lock);
900 0 : while (!list_empty(list)) {
901 0 : struct buffer_head *bh = BH_ENTRY(list->next);
902 0 : if (buffer_dirty(bh)) {
903 : ret = 0;
904 : break;
905 : }
906 0 : __remove_assoc_queue(bh);
907 : }
908 0 : spin_unlock(&buffer_mapping->private_lock);
909 : }
910 27912 : return ret;
911 : }
912 :
913 : /*
914 : * Create the appropriate buffers when given a folio for data area and
915 : * the size of each buffer.. Use the bh->b_this_page linked list to
916 : * follow the buffers created. Return NULL if unable to create more
917 : * buffers.
918 : *
919 : * The retry flag is used to differentiate async IO (paging, swapping)
920 : * which may not fail from ordinary buffer allocations.
921 : */
922 44753878 : struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
923 : bool retry)
924 : {
925 44753878 : struct buffer_head *bh, *head;
926 44753878 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
927 44753878 : long offset;
928 44753878 : struct mem_cgroup *memcg, *old_memcg;
929 :
930 44753878 : if (retry)
931 44763022 : gfp |= __GFP_NOFAIL;
932 :
933 : /* The folio lock pins the memcg */
934 44753878 : memcg = folio_memcg(folio);
935 44753878 : old_memcg = set_active_memcg(memcg);
936 :
937 44738729 : head = NULL;
938 44738729 : offset = folio_size(folio);
939 89798871 : while ((offset -= size) >= 0) {
940 44973885 : bh = alloc_buffer_head(gfp);
941 45061465 : if (!bh)
942 0 : goto no_grow;
943 :
944 45061465 : bh->b_this_page = head;
945 45061465 : bh->b_blocknr = -1;
946 45061465 : head = bh;
947 :
948 45061465 : bh->b_size = size;
949 :
950 : /* Link the buffer to its folio */
951 45061465 : folio_set_bh(bh, folio, offset);
952 : }
953 44824986 : out:
954 44824986 : set_active_memcg(old_memcg);
955 44829812 : return head;
956 : /*
957 : * In case anything failed, we just free everything we got.
958 : */
959 : no_grow:
960 0 : if (head) {
961 0 : do {
962 0 : bh = head;
963 0 : head = head->b_this_page;
964 0 : free_buffer_head(bh);
965 0 : } while (head);
966 : }
967 :
968 0 : goto out;
969 : }
970 : EXPORT_SYMBOL_GPL(folio_alloc_buffers);
971 :
972 0 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
973 : bool retry)
974 : {
975 0 : return folio_alloc_buffers(page_folio(page), size, retry);
976 : }
977 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
978 :
979 1032261 : static inline void link_dev_buffers(struct folio *folio,
980 : struct buffer_head *head)
981 : {
982 1032261 : struct buffer_head *bh, *tail;
983 :
984 1032261 : bh = head;
985 1210177 : do {
986 1210177 : tail = bh;
987 1210177 : bh = bh->b_this_page;
988 1210177 : } while (bh);
989 1032261 : tail->b_this_page = head;
990 1032261 : folio_attach_private(folio, head);
991 1032262 : }
992 :
993 1034814 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
994 : {
995 1034814 : sector_t retval = ~((sector_t)0);
996 1034814 : loff_t sz = bdev_nr_bytes(bdev);
997 :
998 1034814 : if (sz) {
999 1034813 : unsigned int sizebits = blksize_bits(size);
1000 1034813 : retval = (sz >> sizebits);
1001 : }
1002 1034814 : return retval;
1003 : }
1004 :
1005 : /*
1006 : * Initialise the state of a blockdev folio's buffers.
1007 : */
1008 1034814 : static sector_t folio_init_buffers(struct folio *folio,
1009 : struct block_device *bdev, sector_t block, int size)
1010 : {
1011 1034814 : struct buffer_head *head = folio_buffers(folio);
1012 1034814 : struct buffer_head *bh = head;
1013 1034814 : bool uptodate = folio_test_uptodate(folio);
1014 1034814 : sector_t end_block = blkdev_max_block(bdev, size);
1015 :
1016 1212730 : do {
1017 2425460 : if (!buffer_mapped(bh)) {
1018 1210178 : bh->b_end_io = NULL;
1019 1210178 : bh->b_private = NULL;
1020 1210178 : bh->b_bdev = bdev;
1021 1210178 : bh->b_blocknr = block;
1022 1210178 : if (uptodate)
1023 998 : set_buffer_uptodate(bh);
1024 1210178 : if (block < end_block)
1025 1210178 : set_buffer_mapped(bh);
1026 : }
1027 1212730 : block++;
1028 1212730 : bh = bh->b_this_page;
1029 1212730 : } while (bh != head);
1030 :
1031 : /*
1032 : * Caller needs to validate requested block against end of device.
1033 : */
1034 1034814 : return end_block;
1035 : }
1036 :
1037 : /*
1038 : * Create the page-cache page that contains the requested block.
1039 : *
1040 : * This is used purely for blockdev mappings.
1041 : */
1042 : static int
1043 1034412 : grow_dev_page(struct block_device *bdev, sector_t block,
1044 : pgoff_t index, int size, int sizebits, gfp_t gfp)
1045 : {
1046 1034412 : struct inode *inode = bdev->bd_inode;
1047 1034412 : struct folio *folio;
1048 1034412 : struct buffer_head *bh;
1049 1034412 : sector_t end_block;
1050 1034412 : int ret = 0;
1051 1034412 : gfp_t gfp_mask;
1052 :
1053 1034412 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
1054 :
1055 : /*
1056 : * XXX: __getblk_slow() can not really deal with failure and
1057 : * will endlessly loop on improvised global reclaim. Prefer
1058 : * looping in the allocator rather than here, at least that
1059 : * code knows what it's doing.
1060 : */
1061 1034412 : gfp_mask |= __GFP_NOFAIL;
1062 :
1063 1034412 : folio = __filemap_get_folio(inode->i_mapping, index,
1064 : FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
1065 :
1066 1034757 : bh = folio_buffers(folio);
1067 1034757 : if (bh) {
1068 2552 : if (bh->b_size == size) {
1069 5104 : end_block = folio_init_buffers(folio, bdev,
1070 2552 : (sector_t)index << sizebits, size);
1071 2552 : goto done;
1072 : }
1073 0 : if (!try_to_free_buffers(folio))
1074 0 : goto failed;
1075 : }
1076 :
1077 1032205 : bh = folio_alloc_buffers(folio, size, true);
1078 :
1079 : /*
1080 : * Link the folio to the buffers and initialise them. Take the
1081 : * lock to be atomic wrt __find_get_block(), which does not
1082 : * run under the folio lock.
1083 : */
1084 1031489 : spin_lock(&inode->i_mapping->private_lock);
1085 1032262 : link_dev_buffers(folio, bh);
1086 2064524 : end_block = folio_init_buffers(folio, bdev,
1087 1032262 : (sector_t)index << sizebits, size);
1088 1032262 : spin_unlock(&inode->i_mapping->private_lock);
1089 1034814 : done:
1090 1034814 : ret = (block < end_block) ? 1 : -ENXIO;
1091 1034814 : failed:
1092 1034814 : folio_unlock(folio);
1093 1034813 : folio_put(folio);
1094 1034814 : return ret;
1095 : }
1096 :
1097 : /*
1098 : * Create buffers for the specified block device block's page. If
1099 : * that page was dirty, the buffers are set dirty also.
1100 : */
1101 : static int
1102 1034406 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1103 : {
1104 1034406 : pgoff_t index;
1105 1034406 : int sizebits;
1106 :
1107 1034406 : sizebits = PAGE_SHIFT - __ffs(size);
1108 1034406 : index = block >> sizebits;
1109 :
1110 : /*
1111 : * Check for a block which wants to lie outside our maximum possible
1112 : * pagecache index. (this comparison is done using sector_t types).
1113 : */
1114 1034406 : if (unlikely(index != block >> sizebits)) {
1115 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1116 : "device %pg\n",
1117 : __func__, (unsigned long long)block,
1118 : bdev);
1119 : return -EIO;
1120 : }
1121 :
1122 : /* Create a page with the proper size buffers.. */
1123 1034406 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1124 : }
1125 :
1126 : static struct buffer_head *
1127 1033560 : __getblk_slow(struct block_device *bdev, sector_t block,
1128 : unsigned size, gfp_t gfp)
1129 : {
1130 : /* Size must be multiple of hard sectorsize */
1131 2067120 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1132 : (size < 512 || size > PAGE_SIZE))) {
1133 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1134 : size);
1135 0 : printk(KERN_ERR "logical block size: %d\n",
1136 : bdev_logical_block_size(bdev));
1137 :
1138 0 : dump_stack();
1139 0 : return NULL;
1140 : }
1141 :
1142 2068374 : for (;;) {
1143 2068374 : struct buffer_head *bh;
1144 2068374 : int ret;
1145 :
1146 2068374 : bh = __find_get_block(bdev, block, size);
1147 2069371 : if (bh)
1148 1034906 : return bh;
1149 :
1150 1034465 : ret = grow_buffers(bdev, block, size, gfp);
1151 1034814 : if (ret < 0)
1152 : return NULL;
1153 : }
1154 : }
1155 :
1156 : /*
1157 : * The relationship between dirty buffers and dirty pages:
1158 : *
1159 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1160 : * the page is tagged dirty in the page cache.
1161 : *
1162 : * At all times, the dirtiness of the buffers represents the dirtiness of
1163 : * subsections of the page. If the page has buffers, the page dirty bit is
1164 : * merely a hint about the true dirty state.
1165 : *
1166 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1167 : * (if the page has buffers).
1168 : *
1169 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1170 : * buffers are not.
1171 : *
1172 : * Also. When blockdev buffers are explicitly read with bread(), they
1173 : * individually become uptodate. But their backing page remains not
1174 : * uptodate - even if all of its buffers are uptodate. A subsequent
1175 : * block_read_full_folio() against that folio will discover all the uptodate
1176 : * buffers, will set the folio uptodate and will perform no I/O.
1177 : */
1178 :
1179 : /**
1180 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1181 : * @bh: the buffer_head to mark dirty
1182 : *
1183 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1184 : * its backing page dirty, then tag the page as dirty in the page cache
1185 : * and then attach the address_space's inode to its superblock's dirty
1186 : * inode list.
1187 : *
1188 : * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
1189 : * i_pages lock and mapping->host->i_lock.
1190 : */
1191 158987310 : void mark_buffer_dirty(struct buffer_head *bh)
1192 : {
1193 317976169 : WARN_ON_ONCE(!buffer_uptodate(bh));
1194 :
1195 158988859 : trace_block_dirty_buffer(bh);
1196 :
1197 : /*
1198 : * Very *carefully* optimize the it-is-already-dirty case.
1199 : *
1200 : * Don't let the final "is it dirty" escape to before we
1201 : * perhaps modified the buffer.
1202 : */
1203 317955690 : if (buffer_dirty(bh)) {
1204 111775845 : smp_mb();
1205 223787112 : if (buffer_dirty(bh))
1206 : return;
1207 : }
1208 :
1209 47210061 : if (!test_set_buffer_dirty(bh)) {
1210 47245763 : struct folio *folio = bh->b_folio;
1211 47245763 : struct address_space *mapping = NULL;
1212 :
1213 47245763 : folio_memcg_lock(folio);
1214 47239755 : if (!folio_test_set_dirty(folio)) {
1215 45725034 : mapping = folio->mapping;
1216 45725034 : if (mapping)
1217 45725034 : __folio_mark_dirty(folio, mapping, 0);
1218 : }
1219 47190702 : folio_memcg_unlock(folio);
1220 47180610 : if (mapping)
1221 45662932 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1222 : }
1223 : }
1224 : EXPORT_SYMBOL(mark_buffer_dirty);
1225 :
1226 269068 : void mark_buffer_write_io_error(struct buffer_head *bh)
1227 : {
1228 269068 : struct super_block *sb;
1229 :
1230 269068 : set_buffer_write_io_error(bh);
1231 : /* FIXME: do we need to set this in both places? */
1232 269068 : if (bh->b_folio && bh->b_folio->mapping)
1233 269068 : mapping_set_error(bh->b_folio->mapping, -EIO);
1234 269068 : if (bh->b_assoc_map)
1235 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1236 269068 : rcu_read_lock();
1237 269068 : sb = READ_ONCE(bh->b_bdev->bd_super);
1238 269068 : if (sb)
1239 7 : errseq_set(&sb->s_wb_err, -EIO);
1240 269068 : rcu_read_unlock();
1241 269068 : }
1242 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1243 :
1244 : /*
1245 : * Decrement a buffer_head's reference count. If all buffers against a page
1246 : * have zero reference count, are clean and unlocked, and if the page is clean
1247 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1248 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1249 : * a page but it ends up not being freed, and buffers may later be reattached).
1250 : */
1251 154024365 : void __brelse(struct buffer_head * buf)
1252 : {
1253 154024365 : if (atomic_read(&buf->b_count)) {
1254 154024365 : put_bh(buf);
1255 154024365 : return;
1256 : }
1257 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1258 : }
1259 : EXPORT_SYMBOL(__brelse);
1260 :
1261 : /*
1262 : * bforget() is like brelse(), except it discards any
1263 : * potentially dirty data.
1264 : */
1265 61761 : void __bforget(struct buffer_head *bh)
1266 : {
1267 61761 : clear_buffer_dirty(bh);
1268 61768 : if (bh->b_assoc_map) {
1269 163 : struct address_space *buffer_mapping = bh->b_folio->mapping;
1270 :
1271 163 : spin_lock(&buffer_mapping->private_lock);
1272 163 : list_del_init(&bh->b_assoc_buffers);
1273 163 : bh->b_assoc_map = NULL;
1274 163 : spin_unlock(&buffer_mapping->private_lock);
1275 : }
1276 61768 : __brelse(bh);
1277 61768 : }
1278 : EXPORT_SYMBOL(__bforget);
1279 :
1280 86 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1281 : {
1282 86 : lock_buffer(bh);
1283 172 : if (buffer_uptodate(bh)) {
1284 1 : unlock_buffer(bh);
1285 1 : return bh;
1286 : } else {
1287 85 : get_bh(bh);
1288 85 : bh->b_end_io = end_buffer_read_sync;
1289 85 : submit_bh(REQ_OP_READ, bh);
1290 85 : wait_on_buffer(bh);
1291 170 : if (buffer_uptodate(bh))
1292 : return bh;
1293 : }
1294 0 : brelse(bh);
1295 : return NULL;
1296 : }
1297 :
1298 : /*
1299 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1300 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1301 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1302 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1303 : * CPU's LRUs at the same time.
1304 : *
1305 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1306 : * sb_find_get_block().
1307 : *
1308 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1309 : * a local interrupt disable for that.
1310 : */
1311 :
1312 : #define BH_LRU_SIZE 16
1313 :
1314 : struct bh_lru {
1315 : struct buffer_head *bhs[BH_LRU_SIZE];
1316 : };
1317 :
1318 : static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1319 :
1320 : #ifdef CONFIG_SMP
1321 : #define bh_lru_lock() local_irq_disable()
1322 : #define bh_lru_unlock() local_irq_enable()
1323 : #else
1324 : #define bh_lru_lock() preempt_disable()
1325 : #define bh_lru_unlock() preempt_enable()
1326 : #endif
1327 :
1328 : static inline void check_irqs_on(void)
1329 : {
1330 : #ifdef irqs_disabled
1331 145425313 : BUG_ON(irqs_disabled());
1332 : #endif
1333 : }
1334 :
1335 : /*
1336 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1337 : * inserted at the front, and the buffer_head at the back if any is evicted.
1338 : * Or, if already in the LRU it is moved to the front.
1339 : */
1340 11387656 : static void bh_lru_install(struct buffer_head *bh)
1341 : {
1342 11387656 : struct buffer_head *evictee = bh;
1343 11387656 : struct bh_lru *b;
1344 11387656 : int i;
1345 :
1346 11387656 : check_irqs_on();
1347 11387653 : bh_lru_lock();
1348 :
1349 : /*
1350 : * the refcount of buffer_head in bh_lru prevents dropping the
1351 : * attached page(i.e., try_to_free_buffers) so it could cause
1352 : * failing page migration.
1353 : * Skip putting upcoming bh into bh_lru until migration is done.
1354 : */
1355 11387636 : if (lru_cache_disabled()) {
1356 0 : bh_lru_unlock();
1357 0 : return;
1358 : }
1359 :
1360 11387636 : b = this_cpu_ptr(&bh_lrus);
1361 193482453 : for (i = 0; i < BH_LRU_SIZE; i++) {
1362 182096285 : swap(evictee, b->bhs[i]);
1363 182095417 : if (evictee == bh) {
1364 51 : bh_lru_unlock();
1365 51 : return;
1366 : }
1367 : }
1368 :
1369 11386168 : get_bh(bh);
1370 11386232 : bh_lru_unlock();
1371 11386096 : brelse(evictee);
1372 : }
1373 :
1374 : /*
1375 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1376 : */
1377 : static struct buffer_head *
1378 134027088 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1379 : {
1380 134027088 : struct buffer_head *ret = NULL;
1381 134027088 : unsigned int i;
1382 :
1383 134027088 : check_irqs_on();
1384 134037660 : bh_lru_lock();
1385 664211213 : for (i = 0; i < BH_LRU_SIZE; i++) {
1386 516687247 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1387 :
1388 516677314 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1389 120654254 : bh->b_size == size) {
1390 120658751 : if (i) {
1391 247717668 : while (i) {
1392 181145457 : __this_cpu_write(bh_lrus.bhs[i],
1393 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1394 181150285 : i--;
1395 : }
1396 66572211 : __this_cpu_write(bh_lrus.bhs[0], bh);
1397 : }
1398 120669243 : get_bh(bh);
1399 120669243 : ret = bh;
1400 120669243 : break;
1401 : }
1402 : }
1403 134250804 : bh_lru_unlock();
1404 134191694 : return ret;
1405 : }
1406 :
1407 : /*
1408 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1409 : * it in the LRU and mark it as accessed. If it is not present then return
1410 : * NULL
1411 : */
1412 : struct buffer_head *
1413 134031991 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1414 : {
1415 134031991 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1416 :
1417 134212946 : if (bh == NULL) {
1418 : /* __find_get_block_slow will mark the page accessed */
1419 13486289 : bh = __find_get_block_slow(bdev, block);
1420 13489432 : if (bh)
1421 11387930 : bh_lru_install(bh);
1422 : } else
1423 120726657 : touch_buffer(bh);
1424 :
1425 134180020 : return bh;
1426 : }
1427 : EXPORT_SYMBOL(__find_get_block);
1428 :
1429 : /*
1430 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1431 : * which corresponds to the passed block_device, block and size. The
1432 : * returned buffer has its reference count incremented.
1433 : *
1434 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1435 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1436 : */
1437 : struct buffer_head *
1438 130625326 : __getblk_gfp(struct block_device *bdev, sector_t block,
1439 : unsigned size, gfp_t gfp)
1440 : {
1441 130625326 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1442 :
1443 130763044 : might_sleep();
1444 130803710 : if (bh == NULL)
1445 1033140 : bh = __getblk_slow(bdev, block, size, gfp);
1446 130805316 : return bh;
1447 : }
1448 : EXPORT_SYMBOL(__getblk_gfp);
1449 :
1450 : /*
1451 : * Do async read-ahead on a buffer..
1452 : */
1453 464 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1454 : {
1455 464 : struct buffer_head *bh = __getblk(bdev, block, size);
1456 464 : if (likely(bh)) {
1457 464 : bh_readahead(bh, REQ_RAHEAD);
1458 464 : brelse(bh);
1459 : }
1460 464 : }
1461 : EXPORT_SYMBOL(__breadahead);
1462 :
1463 : /**
1464 : * __bread_gfp() - reads a specified block and returns the bh
1465 : * @bdev: the block_device to read from
1466 : * @block: number of block
1467 : * @size: size (in bytes) to read
1468 : * @gfp: page allocation flag
1469 : *
1470 : * Reads a specified block, and returns buffer head that contains it.
1471 : * The page cache can be allocated from non-movable area
1472 : * not to prevent page migration if you set gfp to zero.
1473 : * It returns NULL if the block was unreadable.
1474 : */
1475 : struct buffer_head *
1476 285683 : __bread_gfp(struct block_device *bdev, sector_t block,
1477 : unsigned size, gfp_t gfp)
1478 : {
1479 285683 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1480 :
1481 571370 : if (likely(bh) && !buffer_uptodate(bh))
1482 86 : bh = __bread_slow(bh);
1483 285685 : return bh;
1484 : }
1485 : EXPORT_SYMBOL(__bread_gfp);
1486 :
1487 336412 : static void __invalidate_bh_lrus(struct bh_lru *b)
1488 : {
1489 336412 : int i;
1490 :
1491 5702305 : for (i = 0; i < BH_LRU_SIZE; i++) {
1492 5365302 : brelse(b->bhs[i]);
1493 5367017 : b->bhs[i] = NULL;
1494 : }
1495 337003 : }
1496 : /*
1497 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1498 : * This doesn't race because it runs in each cpu either in irq
1499 : * or with preempt disabled.
1500 : */
1501 11069 : static void invalidate_bh_lru(void *arg)
1502 : {
1503 11069 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1504 :
1505 10796 : __invalidate_bh_lrus(b);
1506 11168 : put_cpu_var(bh_lrus);
1507 11134 : }
1508 :
1509 2502259 : bool has_bh_in_lru(int cpu, void *dummy)
1510 : {
1511 2502259 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1512 2502242 : int i;
1513 :
1514 42336486 : for (i = 0; i < BH_LRU_SIZE; i++) {
1515 39846650 : if (b->bhs[i])
1516 : return true;
1517 : }
1518 :
1519 : return false;
1520 : }
1521 :
1522 545482 : void invalidate_bh_lrus(void)
1523 : {
1524 545482 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1525 545467 : }
1526 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1527 :
1528 : /*
1529 : * It's called from workqueue context so we need a bh_lru_lock to close
1530 : * the race with preemption/irq.
1531 : */
1532 325828 : void invalidate_bh_lrus_cpu(void)
1533 : {
1534 325828 : struct bh_lru *b;
1535 :
1536 325828 : bh_lru_lock();
1537 325534 : b = this_cpu_ptr(&bh_lrus);
1538 325394 : __invalidate_bh_lrus(b);
1539 325674 : bh_lru_unlock();
1540 325541 : }
1541 :
1542 3121538 : void set_bh_page(struct buffer_head *bh,
1543 : struct page *page, unsigned long offset)
1544 : {
1545 3121538 : bh->b_page = page;
1546 3121538 : BUG_ON(offset >= PAGE_SIZE);
1547 3121538 : if (PageHighMem(page))
1548 : /*
1549 : * This catches illegal uses and preserves the offset:
1550 : */
1551 : bh->b_data = (char *)(0 + offset);
1552 : else
1553 3121538 : bh->b_data = page_address(page) + offset;
1554 3121538 : }
1555 : EXPORT_SYMBOL(set_bh_page);
1556 :
1557 45060597 : void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1558 : unsigned long offset)
1559 : {
1560 45060597 : bh->b_folio = folio;
1561 45060597 : BUG_ON(offset >= folio_size(folio));
1562 45032701 : if (folio_test_highmem(folio))
1563 : /*
1564 : * This catches illegal uses and preserves the offset:
1565 : */
1566 : bh->b_data = (char *)(0 + offset);
1567 : else
1568 45032701 : bh->b_data = folio_address(folio) + offset;
1569 45032701 : }
1570 : EXPORT_SYMBOL(folio_set_bh);
1571 :
1572 : /*
1573 : * Called when truncating a buffer on a page completely.
1574 : */
1575 :
1576 : /* Bits that are cleared during an invalidate */
1577 : #define BUFFER_FLAGS_DISCARD \
1578 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1579 : 1 << BH_Delay | 1 << BH_Unwritten)
1580 :
1581 37171442 : static void discard_buffer(struct buffer_head * bh)
1582 : {
1583 37171442 : unsigned long b_state;
1584 :
1585 37171442 : lock_buffer(bh);
1586 37174905 : clear_buffer_dirty(bh);
1587 37175160 : bh->b_bdev = NULL;
1588 37175160 : b_state = READ_ONCE(bh->b_state);
1589 37175160 : do {
1590 37175160 : } while (!try_cmpxchg(&bh->b_state, &b_state,
1591 : b_state & ~BUFFER_FLAGS_DISCARD));
1592 37177177 : unlock_buffer(bh);
1593 37176160 : }
1594 :
1595 : /**
1596 : * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1597 : * @folio: The folio which is affected.
1598 : * @offset: start of the range to invalidate
1599 : * @length: length of the range to invalidate
1600 : *
1601 : * block_invalidate_folio() is called when all or part of the folio has been
1602 : * invalidated by a truncate operation.
1603 : *
1604 : * block_invalidate_folio() does not have to release all buffers, but it must
1605 : * ensure that no dirty buffer is left outside @offset and that no I/O
1606 : * is underway against any of the blocks which are outside the truncation
1607 : * point. Because the caller is about to free (and possibly reuse) those
1608 : * blocks on-disk.
1609 : */
1610 37171478 : void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1611 : {
1612 37171478 : struct buffer_head *head, *bh, *next;
1613 37171478 : size_t curr_off = 0;
1614 37171478 : size_t stop = length + offset;
1615 :
1616 37171478 : BUG_ON(!folio_test_locked(folio));
1617 :
1618 : /*
1619 : * Check for overflow
1620 : */
1621 37171478 : BUG_ON(stop > folio_size(folio) || stop < length);
1622 :
1623 37171767 : head = folio_buffers(folio);
1624 37171767 : if (!head)
1625 : return;
1626 :
1627 : bh = head;
1628 37212894 : do {
1629 37212894 : size_t next_off = curr_off + bh->b_size;
1630 37212894 : next = bh->b_this_page;
1631 :
1632 : /*
1633 : * Are we still fully in range ?
1634 : */
1635 37212894 : if (next_off > stop)
1636 0 : goto out;
1637 :
1638 : /*
1639 : * is this block fully invalidated?
1640 : */
1641 37212894 : if (offset <= curr_off)
1642 37171460 : discard_buffer(bh);
1643 37217967 : curr_off = next_off;
1644 37217967 : bh = next;
1645 37217967 : } while (bh != head);
1646 :
1647 : /*
1648 : * We release buffers only if the entire folio is being invalidated.
1649 : * The get_block cached value has been unconditionally invalidated,
1650 : * so real IO is not possible anymore.
1651 : */
1652 37176840 : if (length == folio_size(folio))
1653 37135211 : filemap_release_folio(folio, 0);
1654 41301 : out:
1655 : return;
1656 : }
1657 : EXPORT_SYMBOL(block_invalidate_folio);
1658 :
1659 : /*
1660 : * We attach and possibly dirty the buffers atomically wrt
1661 : * block_dirty_folio() via private_lock. try_to_free_buffers
1662 : * is already excluded via the folio lock.
1663 : */
1664 43760289 : void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
1665 : unsigned long b_state)
1666 : {
1667 43760289 : struct buffer_head *bh, *head, *tail;
1668 :
1669 43760289 : head = folio_alloc_buffers(folio, blocksize, true);
1670 43760289 : bh = head;
1671 43842859 : do {
1672 43842859 : bh->b_state |= b_state;
1673 43842859 : tail = bh;
1674 43842859 : bh = bh->b_this_page;
1675 43842859 : } while (bh);
1676 43804751 : tail->b_this_page = head;
1677 :
1678 43804751 : spin_lock(&folio->mapping->private_lock);
1679 52067753 : if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
1680 : bh = head;
1681 8245515 : do {
1682 8245515 : if (folio_test_dirty(folio))
1683 0 : set_buffer_dirty(bh);
1684 16451417 : if (folio_test_uptodate(folio))
1685 8218983 : set_buffer_uptodate(bh);
1686 8252907 : bh = bh->b_this_page;
1687 8252907 : } while (bh != head);
1688 : }
1689 43830224 : folio_attach_private(folio, head);
1690 43827178 : spin_unlock(&folio->mapping->private_lock);
1691 43843717 : }
1692 : EXPORT_SYMBOL(folio_create_empty_buffers);
1693 :
1694 906491 : void create_empty_buffers(struct page *page,
1695 : unsigned long blocksize, unsigned long b_state)
1696 : {
1697 906491 : folio_create_empty_buffers(page_folio(page), blocksize, b_state);
1698 908425 : }
1699 : EXPORT_SYMBOL(create_empty_buffers);
1700 :
1701 : /**
1702 : * clean_bdev_aliases: clean a range of buffers in block device
1703 : * @bdev: Block device to clean buffers in
1704 : * @block: Start of a range of blocks to clean
1705 : * @len: Number of blocks to clean
1706 : *
1707 : * We are taking a range of blocks for data and we don't want writeback of any
1708 : * buffer-cache aliases starting from return from this function and until the
1709 : * moment when something will explicitly mark the buffer dirty (hopefully that
1710 : * will not happen until we will free that block ;-) We don't even need to mark
1711 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1712 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1713 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1714 : * would confuse anyone who might pick it with bread() afterwards...
1715 : *
1716 : * Also.. Note that bforget() doesn't lock the buffer. So there can be
1717 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1718 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1719 : * need to. That happens here.
1720 : */
1721 29345308 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1722 : {
1723 29345308 : struct inode *bd_inode = bdev->bd_inode;
1724 29345308 : struct address_space *bd_mapping = bd_inode->i_mapping;
1725 29345308 : struct folio_batch fbatch;
1726 29345308 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1727 29345308 : pgoff_t end;
1728 29345308 : int i, count;
1729 29345308 : struct buffer_head *bh;
1730 29345308 : struct buffer_head *head;
1731 :
1732 29345308 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1733 29345308 : folio_batch_init(&fbatch);
1734 29345308 : while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1735 337699 : count = folio_batch_count(&fbatch);
1736 675398 : for (i = 0; i < count; i++) {
1737 337699 : struct folio *folio = fbatch.folios[i];
1738 :
1739 337699 : if (!folio_buffers(folio))
1740 778 : continue;
1741 : /*
1742 : * We use folio lock instead of bd_mapping->private_lock
1743 : * to pin buffers here since we can afford to sleep and
1744 : * it scales better than a global spinlock lock.
1745 : */
1746 336921 : folio_lock(folio);
1747 : /* Recheck when the folio is locked which pins bhs */
1748 336921 : head = folio_buffers(folio);
1749 336921 : if (!head)
1750 0 : goto unlock_page;
1751 : bh = head;
1752 340980 : do {
1753 681960 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1754 3103 : goto next;
1755 337877 : if (bh->b_blocknr >= block + len)
1756 : break;
1757 336921 : clear_buffer_dirty(bh);
1758 336921 : wait_on_buffer(bh);
1759 336921 : clear_buffer_req(bh);
1760 340024 : next:
1761 340024 : bh = bh->b_this_page;
1762 340024 : } while (bh != head);
1763 336921 : unlock_page:
1764 336921 : folio_unlock(folio);
1765 : }
1766 337699 : folio_batch_release(&fbatch);
1767 337699 : cond_resched();
1768 : /* End of range already reached? */
1769 337699 : if (index > end || !index)
1770 : break;
1771 : }
1772 29327313 : }
1773 : EXPORT_SYMBOL(clean_bdev_aliases);
1774 :
1775 : /*
1776 : * Size is a power-of-two in the range 512..PAGE_SIZE,
1777 : * and the case we care about most is PAGE_SIZE.
1778 : *
1779 : * So this *could* possibly be written with those
1780 : * constraints in mind (relevant mostly if some
1781 : * architecture has a slow bit-scan instruction)
1782 : */
1783 : static inline int block_size_bits(unsigned int blocksize)
1784 : {
1785 0 : return ilog2(blocksize);
1786 : }
1787 :
1788 157051553 : static struct buffer_head *folio_create_buffers(struct folio *folio,
1789 : struct inode *inode,
1790 : unsigned int b_state)
1791 : {
1792 157051553 : BUG_ON(!folio_test_locked(folio));
1793 :
1794 157051553 : if (!folio_buffers(folio))
1795 85711166 : folio_create_empty_buffers(folio,
1796 42855583 : 1 << READ_ONCE(inode->i_blkbits),
1797 : b_state);
1798 157127251 : return folio_buffers(folio);
1799 : }
1800 :
1801 : /*
1802 : * NOTE! All mapped/uptodate combinations are valid:
1803 : *
1804 : * Mapped Uptodate Meaning
1805 : *
1806 : * No No "unknown" - must do get_block()
1807 : * No Yes "hole" - zero-filled
1808 : * Yes No "allocated" - allocated on disk, not read in
1809 : * Yes Yes "valid" - allocated and up-to-date in memory.
1810 : *
1811 : * "Dirty" is valid only with the last case (mapped+uptodate).
1812 : */
1813 :
1814 : /*
1815 : * While block_write_full_page is writing back the dirty buffers under
1816 : * the page lock, whoever dirtied the buffers may decide to clean them
1817 : * again at any time. We handle that by only looking at the buffer
1818 : * state inside lock_buffer().
1819 : *
1820 : * If block_write_full_page() is called for regular writeback
1821 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1822 : * locked buffer. This only can happen if someone has written the buffer
1823 : * directly, with submit_bh(). At the address_space level PageWriteback
1824 : * prevents this contention from occurring.
1825 : *
1826 : * If block_write_full_page() is called with wbc->sync_mode ==
1827 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1828 : * causes the writes to be flagged as synchronous writes.
1829 : */
1830 12057001 : int __block_write_full_folio(struct inode *inode, struct folio *folio,
1831 : get_block_t *get_block, struct writeback_control *wbc,
1832 : bh_end_io_t *handler)
1833 : {
1834 12057001 : int err;
1835 12057001 : sector_t block;
1836 12057001 : sector_t last_block;
1837 12057001 : struct buffer_head *bh, *head;
1838 12057001 : unsigned int blocksize, bbits;
1839 12057001 : int nr_underway = 0;
1840 12057001 : blk_opf_t write_flags = wbc_to_write_flags(wbc);
1841 :
1842 12057001 : head = folio_create_buffers(folio, inode,
1843 : (1 << BH_Dirty) | (1 << BH_Uptodate));
1844 :
1845 : /*
1846 : * Be very careful. We have no exclusion from block_dirty_folio
1847 : * here, and the (potentially unmapped) buffers may become dirty at
1848 : * any time. If a buffer becomes dirty here after we've inspected it
1849 : * then we just miss that fact, and the folio stays dirty.
1850 : *
1851 : * Buffers outside i_size may be dirtied by block_dirty_folio;
1852 : * handle that here by just cleaning them.
1853 : */
1854 :
1855 12056999 : bh = head;
1856 12056999 : blocksize = bh->b_size;
1857 12056999 : bbits = block_size_bits(blocksize);
1858 :
1859 12056999 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1860 12056999 : last_block = (i_size_read(inode) - 1) >> bbits;
1861 :
1862 : /*
1863 : * Get all the dirty buffers mapped to disk addresses and
1864 : * handle any aliases from the underlying blockdev's mapping.
1865 : */
1866 12232087 : do {
1867 12232087 : if (block > last_block) {
1868 : /*
1869 : * mapped buffers outside i_size will occur, because
1870 : * this folio can be outside i_size when there is a
1871 : * truncate in progress.
1872 : */
1873 : /*
1874 : * The buffer was zeroed by block_write_full_page()
1875 : */
1876 9 : clear_buffer_dirty(bh);
1877 9 : set_buffer_uptodate(bh);
1878 36696234 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1879 : buffer_dirty(bh)) {
1880 1722 : WARN_ON(bh->b_size != blocksize);
1881 1722 : err = get_block(inode, block, bh, 1);
1882 1722 : if (err)
1883 14 : goto recover;
1884 1708 : clear_buffer_delay(bh);
1885 3416 : if (buffer_new(bh)) {
1886 : /* blockdev mappings never come here */
1887 20 : clear_buffer_new(bh);
1888 20 : clean_bdev_bh_alias(bh);
1889 : }
1890 : }
1891 12232073 : bh = bh->b_this_page;
1892 12232073 : block++;
1893 12232073 : } while (bh != head);
1894 :
1895 12232077 : do {
1896 24464154 : if (!buffer_mapped(bh))
1897 1133 : continue;
1898 : /*
1899 : * If it's a fully non-blocking write attempt and we cannot
1900 : * lock the buffer then redirty the folio. Note that this can
1901 : * potentially cause a busy-wait loop from writeback threads
1902 : * and kswapd activity, but those code paths have their own
1903 : * higher-level throttling.
1904 : */
1905 12230944 : if (wbc->sync_mode != WB_SYNC_NONE) {
1906 11252260 : lock_buffer(bh);
1907 978684 : } else if (!trylock_buffer(bh)) {
1908 10 : folio_redirty_for_writepage(wbc, folio);
1909 10 : continue;
1910 : }
1911 12230934 : if (test_clear_buffer_dirty(bh)) {
1912 11982688 : mark_buffer_async_write_endio(bh, handler);
1913 : } else {
1914 248243 : unlock_buffer(bh);
1915 : }
1916 12232078 : } while ((bh = bh->b_this_page) != head);
1917 :
1918 : /*
1919 : * The folio and its buffers are protected by the writeback flag,
1920 : * so we can drop the bh refcounts early.
1921 : */
1922 12056986 : BUG_ON(folio_test_writeback(folio));
1923 12056986 : folio_start_writeback(folio);
1924 :
1925 12232075 : do {
1926 12232075 : struct buffer_head *next = bh->b_this_page;
1927 24464150 : if (buffer_async_write(bh)) {
1928 11982690 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1929 11982688 : nr_underway++;
1930 : }
1931 12232073 : bh = next;
1932 12232073 : } while (bh != head);
1933 12056985 : folio_unlock(folio);
1934 :
1935 12056985 : err = 0;
1936 12057001 : done:
1937 12057001 : if (nr_underway == 0) {
1938 : /*
1939 : * The folio was marked dirty, but the buffers were
1940 : * clean. Someone wrote them back by hand with
1941 : * write_dirty_buffer/submit_bh. A rare case.
1942 : */
1943 241960 : folio_end_writeback(folio);
1944 :
1945 : /*
1946 : * The folio and buffer_heads can be released at any time from
1947 : * here on.
1948 : */
1949 : }
1950 12057001 : return err;
1951 :
1952 : recover:
1953 : /*
1954 : * ENOSPC, or some other error. We may already have added some
1955 : * blocks to the file, so we need to write these out to avoid
1956 : * exposing stale data.
1957 : * The folio is currently locked and not marked for writeback
1958 : */
1959 14 : bh = head;
1960 : /* Recovery: lock and submit the mapped buffers */
1961 56 : do {
1962 114 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1963 : !buffer_delay(bh)) {
1964 1 : lock_buffer(bh);
1965 1 : mark_buffer_async_write_endio(bh, handler);
1966 : } else {
1967 : /*
1968 : * The buffer may have been set dirty during
1969 : * attachment to a dirty folio.
1970 : */
1971 55 : clear_buffer_dirty(bh);
1972 : }
1973 56 : } while ((bh = bh->b_this_page) != head);
1974 14 : folio_set_error(folio);
1975 14 : BUG_ON(folio_test_writeback(folio));
1976 14 : mapping_set_error(folio->mapping, err);
1977 14 : folio_start_writeback(folio);
1978 56 : do {
1979 56 : struct buffer_head *next = bh->b_this_page;
1980 112 : if (buffer_async_write(bh)) {
1981 1 : clear_buffer_dirty(bh);
1982 1 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1983 1 : nr_underway++;
1984 : }
1985 56 : bh = next;
1986 56 : } while (bh != head);
1987 14 : folio_unlock(folio);
1988 14 : goto done;
1989 : }
1990 : EXPORT_SYMBOL(__block_write_full_folio);
1991 :
1992 : /*
1993 : * If a folio has any new buffers, zero them out here, and mark them uptodate
1994 : * and dirty so they'll be written out (in order to prevent uninitialised
1995 : * block data from leaking). And clear the new bit.
1996 : */
1997 121480 : void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1998 : {
1999 121480 : size_t block_start, block_end;
2000 121480 : struct buffer_head *head, *bh;
2001 :
2002 121480 : BUG_ON(!folio_test_locked(folio));
2003 121480 : head = folio_buffers(folio);
2004 121480 : if (!head)
2005 : return;
2006 :
2007 : bh = head;
2008 : block_start = 0;
2009 121280 : do {
2010 121280 : block_end = block_start + bh->b_size;
2011 :
2012 242560 : if (buffer_new(bh)) {
2013 0 : if (block_end > from && block_start < to) {
2014 0 : if (!folio_test_uptodate(folio)) {
2015 0 : size_t start, xend;
2016 :
2017 0 : start = max(from, block_start);
2018 0 : xend = min(to, block_end);
2019 :
2020 0 : folio_zero_segment(folio, start, xend);
2021 0 : set_buffer_uptodate(bh);
2022 : }
2023 :
2024 0 : clear_buffer_new(bh);
2025 0 : mark_buffer_dirty(bh);
2026 : }
2027 : }
2028 :
2029 121112 : block_start = block_end;
2030 121112 : bh = bh->b_this_page;
2031 121112 : } while (bh != head);
2032 : }
2033 : EXPORT_SYMBOL(folio_zero_new_buffers);
2034 :
2035 : static void
2036 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
2037 : const struct iomap *iomap)
2038 : {
2039 0 : loff_t offset = block << inode->i_blkbits;
2040 :
2041 0 : bh->b_bdev = iomap->bdev;
2042 :
2043 : /*
2044 : * Block points to offset in file we need to map, iomap contains
2045 : * the offset at which the map starts. If the map ends before the
2046 : * current block, then do not map the buffer and let the caller
2047 : * handle it.
2048 : */
2049 0 : BUG_ON(offset >= iomap->offset + iomap->length);
2050 :
2051 0 : switch (iomap->type) {
2052 : case IOMAP_HOLE:
2053 : /*
2054 : * If the buffer is not up to date or beyond the current EOF,
2055 : * we need to mark it as new to ensure sub-block zeroing is
2056 : * executed if necessary.
2057 : */
2058 0 : if (!buffer_uptodate(bh) ||
2059 : (offset >= i_size_read(inode)))
2060 0 : set_buffer_new(bh);
2061 : break;
2062 : case IOMAP_DELALLOC:
2063 0 : if (!buffer_uptodate(bh) ||
2064 : (offset >= i_size_read(inode)))
2065 0 : set_buffer_new(bh);
2066 0 : set_buffer_uptodate(bh);
2067 0 : set_buffer_mapped(bh);
2068 0 : set_buffer_delay(bh);
2069 : break;
2070 : case IOMAP_UNWRITTEN:
2071 : /*
2072 : * For unwritten regions, we always need to ensure that regions
2073 : * in the block we are not writing to are zeroed. Mark the
2074 : * buffer as new to ensure this.
2075 : */
2076 0 : set_buffer_new(bh);
2077 0 : set_buffer_unwritten(bh);
2078 0 : fallthrough;
2079 0 : case IOMAP_MAPPED:
2080 0 : if ((iomap->flags & IOMAP_F_NEW) ||
2081 : offset >= i_size_read(inode))
2082 0 : set_buffer_new(bh);
2083 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2084 0 : inode->i_blkbits;
2085 0 : set_buffer_mapped(bh);
2086 : break;
2087 : }
2088 0 : }
2089 :
2090 144936292 : int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
2091 : get_block_t *get_block, const struct iomap *iomap)
2092 : {
2093 144936292 : unsigned from = pos & (PAGE_SIZE - 1);
2094 144936292 : unsigned to = from + len;
2095 144936292 : struct inode *inode = folio->mapping->host;
2096 144936292 : unsigned block_start, block_end;
2097 144936292 : sector_t block;
2098 144936292 : int err = 0;
2099 144936292 : unsigned blocksize, bbits;
2100 144936292 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2101 :
2102 144936292 : BUG_ON(!folio_test_locked(folio));
2103 144936292 : BUG_ON(from > PAGE_SIZE);
2104 144936292 : BUG_ON(to > PAGE_SIZE);
2105 144936292 : BUG_ON(from > to);
2106 :
2107 144936292 : head = folio_create_buffers(folio, inode, 0);
2108 144973367 : blocksize = head->b_size;
2109 144973367 : bbits = block_size_bits(blocksize);
2110 :
2111 144973367 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2112 :
2113 289787041 : for(bh = head, block_start = 0; bh != head || !block_start;
2114 144813674 : block++, block_start=block_end, bh = bh->b_this_page) {
2115 145017149 : block_end = block_start + blocksize;
2116 145017149 : if (block_end <= from || block_start >= to) {
2117 8994 : if (folio_test_uptodate(folio)) {
2118 6196 : if (!buffer_uptodate(bh))
2119 0 : set_buffer_uptodate(bh);
2120 : }
2121 5896 : continue;
2122 : }
2123 290022506 : if (buffer_new(bh))
2124 0 : clear_buffer_new(bh);
2125 290022506 : if (!buffer_mapped(bh)) {
2126 43080537 : WARN_ON(bh->b_size != blocksize);
2127 43080537 : if (get_block) {
2128 43080537 : err = get_block(inode, block, bh, 1);
2129 43083966 : if (err)
2130 : break;
2131 : } else {
2132 0 : iomap_to_bh(inode, block, bh, iomap);
2133 : }
2134 :
2135 85924718 : if (buffer_new(bh)) {
2136 29360176 : clean_bdev_bh_alias(bh);
2137 36210810 : if (folio_test_uptodate(folio)) {
2138 6971574 : clear_buffer_new(bh);
2139 7051359 : set_buffer_uptodate(bh);
2140 7051359 : mark_buffer_dirty(bh);
2141 7046276 : continue;
2142 : }
2143 22272680 : if (block_end > to || block_start < from)
2144 2661630 : folio_zero_segments(folio,
2145 : to, block_end,
2146 : block_start, from);
2147 22285814 : continue;
2148 : }
2149 : }
2150 218285691 : if (folio_test_uptodate(folio)) {
2151 205570759 : if (!buffer_uptodate(bh))
2152 1 : set_buffer_uptodate(bh);
2153 102780624 : continue;
2154 : }
2155 50773626 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2156 12690230 : !buffer_unwritten(bh) &&
2157 12690230 : (block_start < from || block_end > to)) {
2158 1945048 : bh_read_nowait(bh, 0);
2159 1945051 : *wait_bh++=bh;
2160 : }
2161 : }
2162 : /*
2163 : * If we issued read requests - let them complete.
2164 : */
2165 146836549 : while(wait_bh > wait) {
2166 1947597 : wait_on_buffer(*--wait_bh);
2167 3890100 : if (!buffer_uptodate(*wait_bh))
2168 0 : err = -EIO;
2169 : }
2170 144888952 : if (unlikely(err))
2171 121458 : folio_zero_new_buffers(folio, from, to);
2172 144888756 : return err;
2173 : }
2174 :
2175 136887758 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2176 : get_block_t *get_block)
2177 : {
2178 136887758 : return __block_write_begin_int(page_folio(page), pos, len, get_block,
2179 : NULL);
2180 : }
2181 : EXPORT_SYMBOL(__block_write_begin);
2182 :
2183 144809343 : static int __block_commit_write(struct inode *inode, struct folio *folio,
2184 : size_t from, size_t to)
2185 : {
2186 144809343 : size_t block_start, block_end;
2187 144809343 : bool partial = false;
2188 144809343 : unsigned blocksize;
2189 144809343 : struct buffer_head *bh, *head;
2190 :
2191 144809343 : bh = head = folio_buffers(folio);
2192 144809343 : blocksize = bh->b_size;
2193 :
2194 144809343 : block_start = 0;
2195 144881266 : do {
2196 144881266 : block_end = block_start + blocksize;
2197 144881266 : if (block_end <= from || block_start >= to) {
2198 11792 : if (!buffer_uptodate(bh))
2199 2701 : partial = true;
2200 : } else {
2201 144875370 : set_buffer_uptodate(bh);
2202 144906655 : mark_buffer_dirty(bh);
2203 : }
2204 290003554 : if (buffer_new(bh))
2205 22321217 : clear_buffer_new(bh);
2206 :
2207 145009710 : block_start = block_end;
2208 145009710 : bh = bh->b_this_page;
2209 145009710 : } while (bh != head);
2210 :
2211 : /*
2212 : * If this is a partial write which happened to make all buffers
2213 : * uptodate then we can optimize away a bogus read_folio() for
2214 : * the next read(). Here we 'discover' whether the folio went
2215 : * uptodate as a result of this (potentially partial) write.
2216 : */
2217 144937787 : if (!partial)
2218 144932218 : folio_mark_uptodate(folio);
2219 144979461 : return 0;
2220 : }
2221 :
2222 : /*
2223 : * block_write_begin takes care of the basic task of block allocation and
2224 : * bringing partial write blocks uptodate first.
2225 : *
2226 : * The filesystem needs to handle block truncation upon failure.
2227 : */
2228 23823836 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2229 : struct page **pagep, get_block_t *get_block)
2230 : {
2231 23823836 : pgoff_t index = pos >> PAGE_SHIFT;
2232 23823836 : struct page *page;
2233 23823836 : int status;
2234 :
2235 23823836 : page = grab_cache_page_write_begin(mapping, index);
2236 23823836 : if (!page)
2237 : return -ENOMEM;
2238 :
2239 23823836 : status = __block_write_begin(page, pos, len, get_block);
2240 23823836 : if (unlikely(status)) {
2241 0 : unlock_page(page);
2242 0 : put_page(page);
2243 0 : page = NULL;
2244 : }
2245 :
2246 23823836 : *pagep = page;
2247 23823836 : return status;
2248 : }
2249 : EXPORT_SYMBOL(block_write_begin);
2250 :
2251 136830399 : int block_write_end(struct file *file, struct address_space *mapping,
2252 : loff_t pos, unsigned len, unsigned copied,
2253 : struct page *page, void *fsdata)
2254 : {
2255 136830399 : struct folio *folio = page_folio(page);
2256 136714792 : struct inode *inode = mapping->host;
2257 136714792 : size_t start = pos - folio_pos(folio);
2258 :
2259 136714792 : if (unlikely(copied < len)) {
2260 : /*
2261 : * The buffers that were written will now be uptodate, so
2262 : * we don't have to worry about a read_folio reading them
2263 : * and overwriting a partial write. However if we have
2264 : * encountered a short write and only partially written
2265 : * into a buffer, it will not be marked uptodate, so a
2266 : * read_folio might come in and destroy our partial write.
2267 : *
2268 : * Do the simplest thing, and just treat any short write to a
2269 : * non uptodate folio as a zero-length write, and force the
2270 : * caller to redo the whole thing.
2271 : */
2272 0 : if (!folio_test_uptodate(folio))
2273 0 : copied = 0;
2274 :
2275 0 : folio_zero_new_buffers(folio, start+copied, start+len);
2276 : }
2277 136714792 : flush_dcache_folio(folio);
2278 :
2279 : /* This could be a short (even 0-length) commit */
2280 136714792 : __block_commit_write(inode, folio, start, start + copied);
2281 :
2282 136866244 : return copied;
2283 : }
2284 : EXPORT_SYMBOL(block_write_end);
2285 :
2286 112877144 : int generic_write_end(struct file *file, struct address_space *mapping,
2287 : loff_t pos, unsigned len, unsigned copied,
2288 : struct page *page, void *fsdata)
2289 : {
2290 112877144 : struct inode *inode = mapping->host;
2291 112877144 : loff_t old_size = inode->i_size;
2292 112877144 : bool i_size_changed = false;
2293 :
2294 112877144 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2295 :
2296 : /*
2297 : * No need to use i_size_read() here, the i_size cannot change under us
2298 : * because we hold i_rwsem.
2299 : *
2300 : * But it's important to update i_size while still holding page lock:
2301 : * page writeout could otherwise come in and zero beyond i_size.
2302 : */
2303 112974208 : if (pos + copied > inode->i_size) {
2304 24144343 : i_size_write(inode, pos + copied);
2305 24144343 : i_size_changed = true;
2306 : }
2307 :
2308 112974208 : unlock_page(page);
2309 112991080 : put_page(page);
2310 :
2311 112975855 : if (old_size < pos)
2312 4710106 : pagecache_isize_extended(inode, old_size, pos);
2313 : /*
2314 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2315 : * makes the holding time of page lock longer. Second, it forces lock
2316 : * ordering of page lock and transaction start for journaling
2317 : * filesystems.
2318 : */
2319 112966060 : if (i_size_changed)
2320 24142152 : mark_inode_dirty(inode);
2321 112992951 : return copied;
2322 : }
2323 : EXPORT_SYMBOL(generic_write_end);
2324 :
2325 : /*
2326 : * block_is_partially_uptodate checks whether buffers within a folio are
2327 : * uptodate or not.
2328 : *
2329 : * Returns true if all buffers which correspond to the specified part
2330 : * of the folio are uptodate.
2331 : */
2332 23 : bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2333 : {
2334 23 : unsigned block_start, block_end, blocksize;
2335 23 : unsigned to;
2336 23 : struct buffer_head *bh, *head;
2337 23 : bool ret = true;
2338 :
2339 23 : head = folio_buffers(folio);
2340 23 : if (!head)
2341 : return false;
2342 23 : blocksize = head->b_size;
2343 23 : to = min_t(unsigned, folio_size(folio) - from, count);
2344 23 : to = from + to;
2345 23 : if (from < blocksize && to > folio_size(folio) - blocksize)
2346 : return false;
2347 :
2348 : bh = head;
2349 : block_start = 0;
2350 3 : do {
2351 3 : block_end = block_start + blocksize;
2352 3 : if (block_end > from && block_start < to) {
2353 6 : if (!buffer_uptodate(bh)) {
2354 : ret = false;
2355 : break;
2356 : }
2357 3 : if (block_end >= to)
2358 : break;
2359 : }
2360 2 : block_start = block_end;
2361 2 : bh = bh->b_this_page;
2362 2 : } while (bh != head);
2363 :
2364 : return ret;
2365 : }
2366 : EXPORT_SYMBOL(block_is_partially_uptodate);
2367 :
2368 : /*
2369 : * Generic "read_folio" function for block devices that have the normal
2370 : * get_block functionality. This is most of the block device filesystems.
2371 : * Reads the folio asynchronously --- the unlock_buffer() and
2372 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2373 : * folio once IO has completed.
2374 : */
2375 101575 : int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2376 : {
2377 101575 : struct inode *inode = folio->mapping->host;
2378 101575 : sector_t iblock, lblock;
2379 101575 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2380 101575 : unsigned int blocksize, bbits;
2381 101575 : int nr, i;
2382 101575 : int fully_mapped = 1;
2383 101575 : bool page_error = false;
2384 101575 : loff_t limit = i_size_read(inode);
2385 :
2386 : /* This is needed for ext4. */
2387 101575 : if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2388 : limit = inode->i_sb->s_maxbytes;
2389 :
2390 101575 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2391 :
2392 101575 : head = folio_create_buffers(folio, inode, 0);
2393 101575 : blocksize = head->b_size;
2394 101575 : bbits = block_size_bits(blocksize);
2395 :
2396 101575 : iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2397 101575 : lblock = (limit+blocksize-1) >> bbits;
2398 101575 : bh = head;
2399 101575 : nr = 0;
2400 101575 : i = 0;
2401 :
2402 103273 : do {
2403 206546 : if (buffer_uptodate(bh))
2404 13005 : continue;
2405 :
2406 180536 : if (!buffer_mapped(bh)) {
2407 89718 : int err = 0;
2408 :
2409 89718 : fully_mapped = 0;
2410 89718 : if (iblock < lblock) {
2411 89396 : WARN_ON(bh->b_size != blocksize);
2412 89396 : err = get_block(inode, iblock, bh, 0);
2413 89396 : if (err) {
2414 0 : folio_set_error(folio);
2415 0 : page_error = true;
2416 : }
2417 : }
2418 179436 : if (!buffer_mapped(bh)) {
2419 47802 : folio_zero_range(folio, i * blocksize,
2420 : blocksize);
2421 47802 : if (!err)
2422 47802 : set_buffer_uptodate(bh);
2423 47802 : continue;
2424 : }
2425 : /*
2426 : * get_block() might have updated the buffer
2427 : * synchronously
2428 : */
2429 83832 : if (buffer_uptodate(bh))
2430 0 : continue;
2431 : }
2432 42466 : arr[nr++] = bh;
2433 103273 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2434 :
2435 101575 : if (fully_mapped)
2436 12889 : folio_set_mappedtodisk(folio);
2437 :
2438 101575 : if (!nr) {
2439 : /*
2440 : * All buffers are uptodate - we can set the folio uptodate
2441 : * as well. But not if get_block() returned an error.
2442 : */
2443 60082 : if (!page_error)
2444 60082 : folio_mark_uptodate(folio);
2445 60082 : folio_unlock(folio);
2446 60082 : return 0;
2447 : }
2448 :
2449 : /* Stage two: lock the buffers */
2450 83959 : for (i = 0; i < nr; i++) {
2451 42466 : bh = arr[i];
2452 42466 : lock_buffer(bh);
2453 42466 : mark_buffer_async_read(bh);
2454 : }
2455 :
2456 : /*
2457 : * Stage 3: start the IO. Check for uptodateness
2458 : * inside the buffer lock in case another process reading
2459 : * the underlying blockdev brought it uptodate (the sct fix).
2460 : */
2461 83959 : for (i = 0; i < nr; i++) {
2462 42466 : bh = arr[i];
2463 84932 : if (buffer_uptodate(bh))
2464 36 : end_buffer_async_read(bh, 1);
2465 : else
2466 42430 : submit_bh(REQ_OP_READ, bh);
2467 : }
2468 : return 0;
2469 : }
2470 : EXPORT_SYMBOL(block_read_full_folio);
2471 :
2472 : /* utility function for filesystems that need to do work on expanding
2473 : * truncates. Uses filesystem pagecache writes to allow the filesystem to
2474 : * deal with the hole.
2475 : */
2476 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2477 : {
2478 0 : struct address_space *mapping = inode->i_mapping;
2479 0 : const struct address_space_operations *aops = mapping->a_ops;
2480 0 : struct page *page;
2481 0 : void *fsdata = NULL;
2482 0 : int err;
2483 :
2484 0 : err = inode_newsize_ok(inode, size);
2485 0 : if (err)
2486 0 : goto out;
2487 :
2488 0 : err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
2489 0 : if (err)
2490 0 : goto out;
2491 :
2492 0 : err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
2493 0 : BUG_ON(err > 0);
2494 :
2495 0 : out:
2496 0 : return err;
2497 : }
2498 : EXPORT_SYMBOL(generic_cont_expand_simple);
2499 :
2500 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2501 : loff_t pos, loff_t *bytes)
2502 : {
2503 0 : struct inode *inode = mapping->host;
2504 0 : const struct address_space_operations *aops = mapping->a_ops;
2505 0 : unsigned int blocksize = i_blocksize(inode);
2506 0 : struct page *page;
2507 0 : void *fsdata = NULL;
2508 0 : pgoff_t index, curidx;
2509 0 : loff_t curpos;
2510 0 : unsigned zerofrom, offset, len;
2511 0 : int err = 0;
2512 :
2513 0 : index = pos >> PAGE_SHIFT;
2514 0 : offset = pos & ~PAGE_MASK;
2515 :
2516 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2517 0 : zerofrom = curpos & ~PAGE_MASK;
2518 0 : if (zerofrom & (blocksize-1)) {
2519 0 : *bytes |= (blocksize-1);
2520 0 : (*bytes)++;
2521 : }
2522 0 : len = PAGE_SIZE - zerofrom;
2523 :
2524 0 : err = aops->write_begin(file, mapping, curpos, len,
2525 : &page, &fsdata);
2526 0 : if (err)
2527 0 : goto out;
2528 0 : zero_user(page, zerofrom, len);
2529 0 : err = aops->write_end(file, mapping, curpos, len, len,
2530 : page, fsdata);
2531 0 : if (err < 0)
2532 0 : goto out;
2533 0 : BUG_ON(err != len);
2534 0 : err = 0;
2535 :
2536 0 : balance_dirty_pages_ratelimited(mapping);
2537 :
2538 0 : if (fatal_signal_pending(current)) {
2539 0 : err = -EINTR;
2540 0 : goto out;
2541 : }
2542 : }
2543 :
2544 : /* page covers the boundary, find the boundary offset */
2545 0 : if (index == curidx) {
2546 0 : zerofrom = curpos & ~PAGE_MASK;
2547 : /* if we will expand the thing last block will be filled */
2548 0 : if (offset <= zerofrom) {
2549 0 : goto out;
2550 : }
2551 0 : if (zerofrom & (blocksize-1)) {
2552 0 : *bytes |= (blocksize-1);
2553 0 : (*bytes)++;
2554 : }
2555 0 : len = offset - zerofrom;
2556 :
2557 0 : err = aops->write_begin(file, mapping, curpos, len,
2558 : &page, &fsdata);
2559 0 : if (err)
2560 0 : goto out;
2561 0 : zero_user(page, zerofrom, len);
2562 0 : err = aops->write_end(file, mapping, curpos, len, len,
2563 : page, fsdata);
2564 0 : if (err < 0)
2565 0 : goto out;
2566 0 : BUG_ON(err != len);
2567 : err = 0;
2568 : }
2569 0 : out:
2570 0 : return err;
2571 : }
2572 :
2573 : /*
2574 : * For moronic filesystems that do not allow holes in file.
2575 : * We may have to extend the file.
2576 : */
2577 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2578 : loff_t pos, unsigned len,
2579 : struct page **pagep, void **fsdata,
2580 : get_block_t *get_block, loff_t *bytes)
2581 : {
2582 0 : struct inode *inode = mapping->host;
2583 0 : unsigned int blocksize = i_blocksize(inode);
2584 0 : unsigned int zerofrom;
2585 0 : int err;
2586 :
2587 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2588 0 : if (err)
2589 : return err;
2590 :
2591 0 : zerofrom = *bytes & ~PAGE_MASK;
2592 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2593 0 : *bytes |= (blocksize-1);
2594 0 : (*bytes)++;
2595 : }
2596 :
2597 0 : return block_write_begin(mapping, pos, len, pagep, get_block);
2598 : }
2599 : EXPORT_SYMBOL(cont_write_begin);
2600 :
2601 51857 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2602 : {
2603 51857 : struct folio *folio = page_folio(page);
2604 51857 : struct inode *inode = folio->mapping->host;
2605 51857 : __block_commit_write(inode, folio, from, to);
2606 51857 : return 0;
2607 : }
2608 : EXPORT_SYMBOL(block_commit_write);
2609 :
2610 : /*
2611 : * block_page_mkwrite() is not allowed to change the file size as it gets
2612 : * called from a page fault handler when a page is first dirtied. Hence we must
2613 : * be careful to check for EOF conditions here. We set the page up correctly
2614 : * for a written page which means we get ENOSPC checking when writing into
2615 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2616 : * support these features.
2617 : *
2618 : * We are not allowed to take the i_mutex here so we have to play games to
2619 : * protect against truncate races as the page could now be beyond EOF. Because
2620 : * truncate writes the inode size before removing pages, once we have the
2621 : * page lock we can determine safely if the page is beyond EOF. If it is not
2622 : * beyond EOF, then the page is guaranteed safe against truncation until we
2623 : * unlock the page.
2624 : *
2625 : * Direct callers of this function should protect against filesystem freezing
2626 : * using sb_start_pagefault() - sb_end_pagefault() functions.
2627 : */
2628 8014011 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2629 : get_block_t get_block)
2630 : {
2631 8014011 : struct folio *folio = page_folio(vmf->page);
2632 7999102 : struct inode *inode = file_inode(vma->vm_file);
2633 7999102 : unsigned long end;
2634 7999102 : loff_t size;
2635 7999102 : int ret;
2636 :
2637 7999102 : folio_lock(folio);
2638 8024436 : size = i_size_read(inode);
2639 8024436 : if ((folio->mapping != inode->i_mapping) ||
2640 : (folio_pos(folio) >= size)) {
2641 : /* We overload EFAULT to mean page got truncated */
2642 64 : ret = -EFAULT;
2643 64 : goto out_unlock;
2644 : }
2645 :
2646 8024372 : end = folio_size(folio);
2647 : /* folio is wholly or partially inside EOF */
2648 8022722 : if (folio_pos(folio) + end > size)
2649 53324 : end = size - folio_pos(folio);
2650 :
2651 8022722 : ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
2652 8032368 : if (!ret)
2653 8028462 : ret = __block_commit_write(inode, folio, 0, end);
2654 :
2655 8043691 : if (unlikely(ret < 0))
2656 3416 : goto out_unlock;
2657 8040275 : folio_mark_dirty(folio);
2658 8038465 : folio_wait_stable(folio);
2659 8038465 : return 0;
2660 3480 : out_unlock:
2661 3480 : folio_unlock(folio);
2662 3480 : return ret;
2663 : }
2664 : EXPORT_SYMBOL(block_page_mkwrite);
2665 :
2666 121 : int block_truncate_page(struct address_space *mapping,
2667 : loff_t from, get_block_t *get_block)
2668 : {
2669 121 : pgoff_t index = from >> PAGE_SHIFT;
2670 121 : unsigned blocksize;
2671 121 : sector_t iblock;
2672 121 : size_t offset, length, pos;
2673 121 : struct inode *inode = mapping->host;
2674 121 : struct folio *folio;
2675 121 : struct buffer_head *bh;
2676 121 : int err = 0;
2677 :
2678 121 : blocksize = i_blocksize(inode);
2679 121 : length = from & (blocksize - 1);
2680 :
2681 : /* Block boundary? Nothing to do */
2682 121 : if (!length)
2683 : return 0;
2684 :
2685 116 : length = blocksize - length;
2686 116 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2687 :
2688 116 : folio = filemap_grab_folio(mapping, index);
2689 116 : if (IS_ERR(folio))
2690 0 : return PTR_ERR(folio);
2691 :
2692 116 : bh = folio_buffers(folio);
2693 116 : if (!bh) {
2694 110 : folio_create_empty_buffers(folio, blocksize, 0);
2695 110 : bh = folio_buffers(folio);
2696 : }
2697 :
2698 : /* Find the buffer that contains "offset" */
2699 116 : offset = offset_in_folio(folio, from);
2700 116 : pos = blocksize;
2701 306 : while (offset >= pos) {
2702 190 : bh = bh->b_this_page;
2703 190 : iblock++;
2704 190 : pos += blocksize;
2705 : }
2706 :
2707 232 : if (!buffer_mapped(bh)) {
2708 111 : WARN_ON(bh->b_size != blocksize);
2709 111 : err = get_block(inode, iblock, bh, 0);
2710 111 : if (err)
2711 0 : goto unlock;
2712 : /* unmapped? It's a hole - nothing to do */
2713 222 : if (!buffer_mapped(bh))
2714 106 : goto unlock;
2715 : }
2716 :
2717 : /* Ok, it's mapped. Make sure it's up-to-date */
2718 17 : if (folio_test_uptodate(folio))
2719 7 : set_buffer_uptodate(bh);
2720 :
2721 26 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2722 3 : err = bh_read(bh, 0);
2723 : /* Uhhuh. Read error. Complain and punt. */
2724 3 : if (err < 0)
2725 0 : goto unlock;
2726 : }
2727 :
2728 10 : folio_zero_range(folio, offset, length);
2729 10 : mark_buffer_dirty(bh);
2730 :
2731 116 : unlock:
2732 116 : folio_unlock(folio);
2733 116 : folio_put(folio);
2734 :
2735 116 : return err;
2736 : }
2737 : EXPORT_SYMBOL(block_truncate_page);
2738 :
2739 : /*
2740 : * The generic ->writepage function for buffer-backed address_spaces
2741 : */
2742 12056997 : int block_write_full_page(struct page *page, get_block_t *get_block,
2743 : struct writeback_control *wbc)
2744 : {
2745 12056997 : struct folio *folio = page_folio(page);
2746 12056997 : struct inode * const inode = folio->mapping->host;
2747 12056997 : loff_t i_size = i_size_read(inode);
2748 :
2749 : /* Is the folio fully inside i_size? */
2750 12056997 : if (folio_pos(folio) + folio_size(folio) <= i_size)
2751 12056989 : return __block_write_full_folio(inode, folio, get_block, wbc,
2752 : end_buffer_async_write);
2753 :
2754 : /* Is the folio fully outside i_size? (truncate in progress) */
2755 7 : if (folio_pos(folio) >= i_size) {
2756 0 : folio_unlock(folio);
2757 0 : return 0; /* don't care */
2758 : }
2759 :
2760 : /*
2761 : * The folio straddles i_size. It must be zeroed out on each and every
2762 : * writepage invocation because it may be mmapped. "A file is mapped
2763 : * in multiples of the page size. For a file that is not a multiple of
2764 : * the page size, the remaining memory is zeroed when mapped, and
2765 : * writes to that region are not written out to the file."
2766 : */
2767 7 : folio_zero_segment(folio, offset_in_folio(folio, i_size),
2768 : folio_size(folio));
2769 7 : return __block_write_full_folio(inode, folio, get_block, wbc,
2770 : end_buffer_async_write);
2771 : }
2772 : EXPORT_SYMBOL(block_write_full_page);
2773 :
2774 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2775 : get_block_t *get_block)
2776 : {
2777 0 : struct inode *inode = mapping->host;
2778 0 : struct buffer_head tmp = {
2779 0 : .b_size = i_blocksize(inode),
2780 : };
2781 :
2782 0 : get_block(inode, block, &tmp, 0);
2783 0 : return tmp.b_blocknr;
2784 : }
2785 : EXPORT_SYMBOL(generic_block_bmap);
2786 :
2787 18026476 : static void end_bio_bh_io_sync(struct bio *bio)
2788 : {
2789 18026476 : struct buffer_head *bh = bio->bi_private;
2790 :
2791 18026476 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
2792 0 : set_bit(BH_Quiet, &bh->b_state);
2793 :
2794 18026476 : bh->b_end_io(bh, !bio->bi_status);
2795 18026478 : bio_put(bio);
2796 18026478 : }
2797 :
2798 18026026 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2799 : struct writeback_control *wbc)
2800 : {
2801 18026026 : const enum req_op op = opf & REQ_OP_MASK;
2802 18026026 : struct bio *bio;
2803 :
2804 36052052 : BUG_ON(!buffer_locked(bh));
2805 36052052 : BUG_ON(!buffer_mapped(bh));
2806 18026026 : BUG_ON(!bh->b_end_io);
2807 36052052 : BUG_ON(buffer_delay(bh));
2808 36052052 : BUG_ON(buffer_unwritten(bh));
2809 :
2810 : /*
2811 : * Only clear out a write error when rewriting
2812 : */
2813 18026026 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2814 3308846 : clear_buffer_write_io_error(bh);
2815 :
2816 36052482 : if (buffer_meta(bh))
2817 1628228 : opf |= REQ_META;
2818 36052482 : if (buffer_prio(bh))
2819 1628228 : opf |= REQ_PRIO;
2820 :
2821 18026241 : bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2822 :
2823 18026127 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2824 :
2825 18026127 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2826 :
2827 18026127 : __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
2828 :
2829 18025815 : bio->bi_end_io = end_bio_bh_io_sync;
2830 18025815 : bio->bi_private = bh;
2831 :
2832 : /* Take care of bh's that straddle the end of the device */
2833 18025815 : guard_bio_eod(bio);
2834 :
2835 18026111 : if (wbc) {
2836 11982682 : wbc_init_bio(wbc, bio);
2837 11982686 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
2838 : }
2839 :
2840 18026111 : submit_bio(bio);
2841 18026194 : }
2842 :
2843 3831438 : void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2844 : {
2845 3873953 : submit_bh_wbc(opf, bh, NULL);
2846 260774 : }
2847 : EXPORT_SYMBOL(submit_bh);
2848 :
2849 165770 : void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2850 : {
2851 165770 : lock_buffer(bh);
2852 165770 : if (!test_clear_buffer_dirty(bh)) {
2853 6 : unlock_buffer(bh);
2854 6 : return;
2855 : }
2856 165765 : bh->b_end_io = end_buffer_write_sync;
2857 165765 : get_bh(bh);
2858 165765 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2859 : }
2860 : EXPORT_SYMBOL(write_dirty_buffer);
2861 :
2862 : /*
2863 : * For a data-integrity writeout, we need to wait upon any in-progress I/O
2864 : * and then start new I/O and then wait upon it. The caller must have a ref on
2865 : * the buffer_head.
2866 : */
2867 1975 : int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2868 : {
2869 1975 : WARN_ON(atomic_read(&bh->b_count) < 1);
2870 1975 : lock_buffer(bh);
2871 1975 : if (test_clear_buffer_dirty(bh)) {
2872 : /*
2873 : * The bh should be mapped, but it might not be if the
2874 : * device was hot-removed. Not much we can do but fail the I/O.
2875 : */
2876 3756 : if (!buffer_mapped(bh)) {
2877 0 : unlock_buffer(bh);
2878 0 : return -EIO;
2879 : }
2880 :
2881 1878 : get_bh(bh);
2882 1878 : bh->b_end_io = end_buffer_write_sync;
2883 1878 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2884 1878 : wait_on_buffer(bh);
2885 3756 : if (!buffer_uptodate(bh))
2886 0 : return -EIO;
2887 : } else {
2888 97 : unlock_buffer(bh);
2889 : }
2890 : return 0;
2891 : }
2892 : EXPORT_SYMBOL(__sync_dirty_buffer);
2893 :
2894 1975 : int sync_dirty_buffer(struct buffer_head *bh)
2895 : {
2896 1975 : return __sync_dirty_buffer(bh, REQ_SYNC);
2897 : }
2898 : EXPORT_SYMBOL(sync_dirty_buffer);
2899 :
2900 : /*
2901 : * try_to_free_buffers() checks if all the buffers on this particular folio
2902 : * are unused, and releases them if so.
2903 : *
2904 : * Exclusion against try_to_free_buffers may be obtained by either
2905 : * locking the folio or by holding its mapping's private_lock.
2906 : *
2907 : * If the folio is dirty but all the buffers are clean then we need to
2908 : * be sure to mark the folio clean as well. This is because the folio
2909 : * may be against a block device, and a later reattachment of buffers
2910 : * to a dirty folio will set *all* buffers dirty. Which would corrupt
2911 : * filesystem data on the same device.
2912 : *
2913 : * The same applies to regular filesystem folios: if all the buffers are
2914 : * clean then we set the folio clean and proceed. To do that, we require
2915 : * total exclusion from block_dirty_folio(). That is obtained with
2916 : * private_lock.
2917 : *
2918 : * try_to_free_buffers() is non-blocking.
2919 : */
2920 : static inline int buffer_busy(struct buffer_head *bh)
2921 : {
2922 46336463 : return atomic_read(&bh->b_count) |
2923 46336463 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2924 : }
2925 :
2926 : static bool
2927 46120432 : drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2928 : {
2929 46120432 : struct buffer_head *head = folio_buffers(folio);
2930 46120432 : struct buffer_head *bh;
2931 :
2932 46120432 : bh = head;
2933 46336463 : do {
2934 46336463 : if (buffer_busy(bh))
2935 1224798 : goto failed;
2936 45111665 : bh = bh->b_this_page;
2937 45111665 : } while (bh != head);
2938 :
2939 45111658 : do {
2940 45111658 : struct buffer_head *next = bh->b_this_page;
2941 :
2942 45111658 : if (bh->b_assoc_map)
2943 0 : __remove_assoc_queue(bh);
2944 45111080 : bh = next;
2945 45111080 : } while (bh != head);
2946 44895056 : *buffers_to_free = head;
2947 44895056 : folio_detach_private(folio);
2948 44895056 : return true;
2949 : failed:
2950 1224798 : return false;
2951 : }
2952 :
2953 46119616 : bool try_to_free_buffers(struct folio *folio)
2954 : {
2955 46119616 : struct address_space * const mapping = folio->mapping;
2956 46119616 : struct buffer_head *buffers_to_free = NULL;
2957 46119616 : bool ret = 0;
2958 :
2959 46119616 : BUG_ON(!folio_test_locked(folio));
2960 46119616 : if (folio_test_writeback(folio))
2961 : return false;
2962 :
2963 46119616 : if (mapping == NULL) { /* can this still happen? */
2964 0 : ret = drop_buffers(folio, &buffers_to_free);
2965 0 : goto out;
2966 : }
2967 :
2968 46119616 : spin_lock(&mapping->private_lock);
2969 46123653 : ret = drop_buffers(folio, &buffers_to_free);
2970 :
2971 : /*
2972 : * If the filesystem writes its buffers by hand (eg ext3)
2973 : * then we can have clean buffers against a dirty folio. We
2974 : * clean the folio here; otherwise the VM will never notice
2975 : * that the filesystem did any IO at all.
2976 : *
2977 : * Also, during truncate, discard_buffer will have marked all
2978 : * the folio's buffers clean. We discover that here and clean
2979 : * the folio also.
2980 : *
2981 : * private_lock must be held over this entire operation in order
2982 : * to synchronise against block_dirty_folio and prevent the
2983 : * dirty bit from being lost.
2984 : */
2985 46121991 : if (ret)
2986 44895973 : folio_cancel_dirty(folio);
2987 46121386 : spin_unlock(&mapping->private_lock);
2988 46123442 : out:
2989 46123442 : if (buffers_to_free) {
2990 : struct buffer_head *bh = buffers_to_free;
2991 :
2992 45114808 : do {
2993 45114808 : struct buffer_head *next = bh->b_this_page;
2994 45114808 : free_buffer_head(bh);
2995 45111537 : bh = next;
2996 45111537 : } while (bh != buffers_to_free);
2997 : }
2998 : return ret;
2999 : }
3000 : EXPORT_SYMBOL(try_to_free_buffers);
3001 :
3002 : /*
3003 : * Buffer-head allocation
3004 : */
3005 : static struct kmem_cache *bh_cachep __read_mostly;
3006 :
3007 : /*
3008 : * Once the number of bh's in the machine exceeds this level, we start
3009 : * stripping them in writeback.
3010 : */
3011 : static unsigned long max_buffer_heads;
3012 :
3013 : int buffer_heads_over_limit;
3014 :
3015 : struct bh_accounting {
3016 : int nr; /* Number of live bh's */
3017 : int ratelimit; /* Limit cacheline bouncing */
3018 : };
3019 :
3020 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3021 :
3022 96421664 : static void recalc_bh_state(void)
3023 : {
3024 96421664 : int i;
3025 96421664 : int tot = 0;
3026 :
3027 96421664 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3028 : return;
3029 23519 : __this_cpu_write(bh_accounting.ratelimit, 0);
3030 117370 : for_each_online_cpu(i)
3031 93851 : tot += per_cpu(bh_accounting, i).nr;
3032 23519 : buffer_heads_over_limit = (tot > max_buffer_heads);
3033 : }
3034 :
3035 48136798 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3036 : {
3037 48136798 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3038 48181554 : if (ret) {
3039 48181554 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
3040 48181554 : spin_lock_init(&ret->b_uptodate_lock);
3041 48171789 : preempt_disable();
3042 48182861 : __this_cpu_inc(bh_accounting.nr);
3043 48174666 : recalc_bh_state();
3044 48181820 : preempt_enable();
3045 : }
3046 48176456 : return ret;
3047 : }
3048 : EXPORT_SYMBOL(alloc_buffer_head);
3049 :
3050 48233821 : void free_buffer_head(struct buffer_head *bh)
3051 : {
3052 48233821 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
3053 48233821 : kmem_cache_free(bh_cachep, bh);
3054 48236262 : preempt_disable();
3055 48234613 : __this_cpu_dec(bh_accounting.nr);
3056 48232629 : recalc_bh_state();
3057 48232244 : preempt_enable();
3058 48232883 : }
3059 : EXPORT_SYMBOL(free_buffer_head);
3060 :
3061 264 : static int buffer_exit_cpu_dead(unsigned int cpu)
3062 : {
3063 264 : int i;
3064 264 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3065 :
3066 4488 : for (i = 0; i < BH_LRU_SIZE; i++) {
3067 4224 : brelse(b->bhs[i]);
3068 4224 : b->bhs[i] = NULL;
3069 : }
3070 264 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3071 264 : per_cpu(bh_accounting, cpu).nr = 0;
3072 264 : return 0;
3073 : }
3074 :
3075 : /**
3076 : * bh_uptodate_or_lock - Test whether the buffer is uptodate
3077 : * @bh: struct buffer_head
3078 : *
3079 : * Return true if the buffer is up-to-date and false,
3080 : * with the buffer locked, if not.
3081 : */
3082 26299376 : int bh_uptodate_or_lock(struct buffer_head *bh)
3083 : {
3084 52597119 : if (!buffer_uptodate(bh)) {
3085 1990614 : lock_buffer(bh);
3086 3981221 : if (!buffer_uptodate(bh))
3087 : return 0;
3088 4504 : unlock_buffer(bh);
3089 : }
3090 : return 1;
3091 : }
3092 : EXPORT_SYMBOL(bh_uptodate_or_lock);
3093 :
3094 : /**
3095 : * __bh_read - Submit read for a locked buffer
3096 : * @bh: struct buffer_head
3097 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3098 : * @wait: wait until reading finish
3099 : *
3100 : * Returns zero on success or don't wait, and -EIO on error.
3101 : */
3102 1949589 : int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3103 : {
3104 1949589 : int ret = 0;
3105 :
3106 3899178 : BUG_ON(!buffer_locked(bh));
3107 :
3108 1949589 : get_bh(bh);
3109 1949595 : bh->b_end_io = end_buffer_read_sync;
3110 1949595 : submit_bh(REQ_OP_READ | op_flags, bh);
3111 1949601 : if (wait) {
3112 2517 : wait_on_buffer(bh);
3113 5034 : if (!buffer_uptodate(bh))
3114 0 : ret = -EIO;
3115 : }
3116 1949601 : return ret;
3117 : }
3118 : EXPORT_SYMBOL(__bh_read);
3119 :
3120 : /**
3121 : * __bh_read_batch - Submit read for a batch of unlocked buffers
3122 : * @nr: entry number of the buffer batch
3123 : * @bhs: a batch of struct buffer_head
3124 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3125 : * @force_lock: force to get a lock on the buffer if set, otherwise drops any
3126 : * buffer that cannot lock.
3127 : *
3128 : * Returns zero on success or don't wait, and -EIO on error.
3129 : */
3130 7034 : void __bh_read_batch(int nr, struct buffer_head *bhs[],
3131 : blk_opf_t op_flags, bool force_lock)
3132 : {
3133 7034 : int i;
3134 :
3135 59620 : for (i = 0; i < nr; i++) {
3136 52586 : struct buffer_head *bh = bhs[i];
3137 :
3138 105172 : if (buffer_uptodate(bh))
3139 0 : continue;
3140 :
3141 52586 : if (force_lock)
3142 0 : lock_buffer(bh);
3143 : else
3144 52586 : if (!trylock_buffer(bh))
3145 0 : continue;
3146 :
3147 105172 : if (buffer_uptodate(bh)) {
3148 0 : unlock_buffer(bh);
3149 0 : continue;
3150 : }
3151 :
3152 52586 : bh->b_end_io = end_buffer_read_sync;
3153 52586 : get_bh(bh);
3154 52586 : submit_bh(REQ_OP_READ | op_flags, bh);
3155 : }
3156 7034 : }
3157 : EXPORT_SYMBOL(__bh_read_batch);
3158 :
3159 0 : void __init buffer_init(void)
3160 : {
3161 0 : unsigned long nrpages;
3162 0 : int ret;
3163 :
3164 0 : bh_cachep = kmem_cache_create("buffer_head",
3165 : sizeof(struct buffer_head), 0,
3166 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3167 : SLAB_MEM_SPREAD),
3168 : NULL);
3169 :
3170 : /*
3171 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3172 : */
3173 0 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3174 0 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3175 0 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3176 : NULL, buffer_exit_cpu_dead);
3177 0 : WARN_ON(ret < 0);
3178 0 : }
|