Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 : #include <linux/fsverity.h>
52 :
53 : #include "internal.h"
54 :
55 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
56 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
57 : struct writeback_control *wbc);
58 :
59 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
60 :
61 153718 : inline void touch_buffer(struct buffer_head *bh)
62 : {
63 153718 : trace_block_touch_buffer(bh);
64 153718 : folio_mark_accessed(bh->b_folio);
65 153719 : }
66 : EXPORT_SYMBOL(touch_buffer);
67 :
68 0 : void __lock_buffer(struct buffer_head *bh)
69 : {
70 9 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
71 9 : }
72 : EXPORT_SYMBOL(__lock_buffer);
73 :
74 609837 : void unlock_buffer(struct buffer_head *bh)
75 : {
76 609837 : clear_bit_unlock(BH_Lock, &bh->b_state);
77 609835 : smp_mb__after_atomic();
78 609837 : wake_up_bit(&bh->b_state, BH_Lock);
79 609837 : }
80 : EXPORT_SYMBOL(unlock_buffer);
81 :
82 : /*
83 : * Returns if the folio has dirty or writeback buffers. If all the buffers
84 : * are unlocked and clean then the folio_test_dirty information is stale. If
85 : * any of the buffers are locked, it is assumed they are locked for IO.
86 : */
87 71 : void buffer_check_dirty_writeback(struct folio *folio,
88 : bool *dirty, bool *writeback)
89 : {
90 71 : struct buffer_head *head, *bh;
91 71 : *dirty = false;
92 71 : *writeback = false;
93 :
94 71 : BUG_ON(!folio_test_locked(folio));
95 :
96 71 : head = folio_buffers(folio);
97 71 : if (!head)
98 : return;
99 :
100 71 : if (folio_test_writeback(folio))
101 0 : *writeback = true;
102 :
103 : bh = head;
104 71 : do {
105 142 : if (buffer_locked(bh))
106 0 : *writeback = true;
107 :
108 142 : if (buffer_dirty(bh))
109 0 : *dirty = true;
110 :
111 71 : bh = bh->b_this_page;
112 71 : } while (bh != head);
113 : }
114 :
115 : /*
116 : * Block until a buffer comes unlocked. This doesn't stop it
117 : * from becoming locked again - you have to lock it yourself
118 : * if you want to preserve its state.
119 : */
120 2 : void __wait_on_buffer(struct buffer_head * bh)
121 : {
122 102764 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
123 102762 : }
124 : EXPORT_SYMBOL(__wait_on_buffer);
125 :
126 81740 : static void buffer_io_error(struct buffer_head *bh, char *msg)
127 : {
128 163480 : if (!test_bit(BH_Quiet, &bh->b_state))
129 81740 : printk_ratelimited(KERN_ERR
130 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
131 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
132 81740 : }
133 :
/*
 * End-of-IO helper that does not touch the bh once it has been unlocked.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it,
 * but a race there is benign: unlock_buffer() only uses the bh's address
 * for hashing after unlocking the buffer, so it doesn't actually touch
 * the bh itself.
 *
 * @bh:       buffer whose read finished
 * @uptodate: non-zero if the read succeeded
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
152 :
/*
 * Default synchronous end-of-IO handler for reads: record the uptodate
 * state, unlock the buffer, then drop a reference with put_bh().
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
163 :
/*
 * Synchronous end-of-IO handler for writes.
 *
 * On success the buffer is marked uptodate.  On failure the error is
 * logged, recorded with mark_buffer_write_io_error() and the uptodate
 * bit is cleared.  Either way the buffer is unlocked and a reference is
 * dropped with put_bh().
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
177 :
178 : /*
179 : * Various filesystems appear to want __find_get_block to be non-blocking.
180 : * But it's the page lock which protects the buffers. To get around this,
181 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
182 : * private_lock.
183 : *
184 : * Hack idea: for the blockdev mapping, private_lock contention
185 : * may be quite high. This code could TryLock the page, and if that
186 : * succeeds, there is no need to take private_lock.
187 : */
188 : static struct buffer_head *
189 2531 : __find_get_block_slow(struct block_device *bdev, sector_t block)
190 : {
191 2531 : struct inode *bd_inode = bdev->bd_inode;
192 2531 : struct address_space *bd_mapping = bd_inode->i_mapping;
193 2531 : struct buffer_head *ret = NULL;
194 2531 : pgoff_t index;
195 2531 : struct buffer_head *bh;
196 2531 : struct buffer_head *head;
197 2531 : struct folio *folio;
198 2531 : int all_mapped = 1;
199 2531 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
200 :
201 2531 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
202 2531 : folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
203 2531 : if (IS_ERR(folio))
204 442 : goto out;
205 :
206 2089 : spin_lock(&bd_mapping->private_lock);
207 2089 : head = folio_buffers(folio);
208 2089 : if (!head)
209 24 : goto out_unlock;
210 : bh = head;
211 70669 : do {
212 141338 : if (!buffer_mapped(bh))
213 : all_mapped = 0;
214 70669 : else if (bh->b_blocknr == block) {
215 2065 : ret = bh;
216 2065 : get_bh(bh);
217 2065 : goto out_unlock;
218 : }
219 68604 : bh = bh->b_this_page;
220 68604 : } while (bh != head);
221 :
222 : /* we might be here because some of the buffers on this page are
223 : * not mapped. This is due to various races between
224 : * file io on the block device and getblk. It gets dealt with
225 : * elsewhere, don't buffer_error if we had some unmapped buffers
226 : */
227 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
228 0 : if (all_mapped && __ratelimit(&last_warned)) {
229 0 : printk("__find_get_block_slow() failed. block=%llu, "
230 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
231 : "device %pg blocksize: %d\n",
232 : (unsigned long long)block,
233 : (unsigned long long)bh->b_blocknr,
234 : bh->b_state, bh->b_size, bdev,
235 : 1 << bd_inode->i_blkbits);
236 : }
237 0 : out_unlock:
238 2089 : spin_unlock(&bd_mapping->private_lock);
239 2089 : folio_put(folio);
240 2531 : out:
241 2531 : return ret;
242 : }
243 :
244 84190 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
245 : {
246 84190 : unsigned long flags;
247 84190 : struct buffer_head *first;
248 84190 : struct buffer_head *tmp;
249 84190 : struct folio *folio;
250 84190 : int folio_uptodate = 1;
251 :
252 168380 : BUG_ON(!buffer_async_read(bh));
253 :
254 84190 : folio = bh->b_folio;
255 84190 : if (uptodate) {
256 2472 : set_buffer_uptodate(bh);
257 : } else {
258 81718 : clear_buffer_uptodate(bh);
259 81718 : buffer_io_error(bh, ", async page read");
260 81718 : folio_set_error(folio);
261 : }
262 :
263 : /*
264 : * Be _very_ careful from here on. Bad things can happen if
265 : * two buffer heads end IO at almost the same time and both
266 : * decide that the page is now completely done.
267 : */
268 84190 : first = folio_buffers(folio);
269 84190 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
270 84190 : clear_buffer_async_read(bh);
271 84190 : unlock_buffer(bh);
272 84190 : tmp = bh;
273 248274 : do {
274 496548 : if (!buffer_uptodate(tmp))
275 236413 : folio_uptodate = 0;
276 496548 : if (buffer_async_read(tmp)) {
277 155958 : BUG_ON(!buffer_locked(tmp));
278 77979 : goto still_busy;
279 : }
280 170295 : tmp = tmp->b_this_page;
281 170295 : } while (tmp != bh);
282 6211 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
283 :
284 : /*
285 : * If all of the buffers are uptodate then we can set the page
286 : * uptodate.
287 : */
288 6211 : if (folio_uptodate)
289 1096 : folio_mark_uptodate(folio);
290 6211 : folio_unlock(folio);
291 6211 : return;
292 :
293 : still_busy:
294 77979 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
295 : return;
296 : }
297 :
298 : struct postprocess_bh_ctx {
299 : struct work_struct work;
300 : struct buffer_head *bh;
301 : };
302 :
303 : static void verify_bh(struct work_struct *work)
304 : {
305 : struct postprocess_bh_ctx *ctx =
306 : container_of(work, struct postprocess_bh_ctx, work);
307 : struct buffer_head *bh = ctx->bh;
308 : bool valid;
309 :
310 : valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
311 : end_buffer_async_read(bh, valid);
312 : kfree(ctx);
313 : }
314 :
315 : static bool need_fsverity(struct buffer_head *bh)
316 : {
317 : struct folio *folio = bh->b_folio;
318 : struct inode *inode = folio->mapping->host;
319 :
320 : return fsverity_active(inode) &&
321 : /* needed by ext4 */
322 : folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
323 : }
324 :
325 : static void decrypt_bh(struct work_struct *work)
326 : {
327 : struct postprocess_bh_ctx *ctx =
328 : container_of(work, struct postprocess_bh_ctx, work);
329 : struct buffer_head *bh = ctx->bh;
330 : int err;
331 :
332 : err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
333 : bh_offset(bh));
334 : if (err == 0 && need_fsverity(bh)) {
335 : /*
336 : * We use different work queues for decryption and for verity
337 : * because verity may require reading metadata pages that need
338 : * decryption, and we shouldn't recurse to the same workqueue.
339 : */
340 : INIT_WORK(&ctx->work, verify_bh);
341 : fsverity_enqueue_verify_work(&ctx->work);
342 : return;
343 : }
344 : end_buffer_async_read(bh, err == 0);
345 : kfree(ctx);
346 : }
347 :
348 : /*
349 : * I/O completion handler for block_read_full_folio() - pages
350 : * which come unlocked at the end of I/O.
351 : */
352 84188 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
353 : {
354 84188 : struct inode *inode = bh->b_folio->mapping->host;
355 84188 : bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
356 84188 : bool verify = need_fsverity(bh);
357 :
358 : /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
359 84188 : if (uptodate && (decrypt || verify)) {
360 : struct postprocess_bh_ctx *ctx =
361 : kmalloc(sizeof(*ctx), GFP_ATOMIC);
362 :
363 : if (ctx) {
364 : ctx->bh = bh;
365 : if (decrypt) {
366 : INIT_WORK(&ctx->work, decrypt_bh);
367 : fscrypt_enqueue_decrypt_work(&ctx->work);
368 : } else {
369 : INIT_WORK(&ctx->work, verify_bh);
370 : fsverity_enqueue_verify_work(&ctx->work);
371 : }
372 : return;
373 : }
374 : uptodate = 0;
375 : }
376 84188 : end_buffer_async_read(bh, uptodate);
377 : }
378 :
379 : /*
380 : * Completion handler for block_write_full_page() - pages which are unlocked
381 : * during I/O, and which have PageWriteback cleared upon I/O completion.
382 : */
383 192604 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
384 : {
385 192604 : unsigned long flags;
386 192604 : struct buffer_head *first;
387 192604 : struct buffer_head *tmp;
388 192604 : struct folio *folio;
389 :
390 385208 : BUG_ON(!buffer_async_write(bh));
391 :
392 192604 : folio = bh->b_folio;
393 192604 : if (uptodate) {
394 192582 : set_buffer_uptodate(bh);
395 : } else {
396 22 : buffer_io_error(bh, ", lost async page write");
397 22 : mark_buffer_write_io_error(bh);
398 22 : clear_buffer_uptodate(bh);
399 22 : folio_set_error(folio);
400 : }
401 :
402 192604 : first = folio_buffers(folio);
403 192604 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
404 :
405 192604 : clear_buffer_async_write(bh);
406 192604 : unlock_buffer(bh);
407 192604 : tmp = bh->b_this_page;
408 306853 : while (tmp != bh) {
409 288336 : if (buffer_async_write(tmp)) {
410 59838 : BUG_ON(!buffer_locked(tmp));
411 29919 : goto still_busy;
412 : }
413 114249 : tmp = tmp->b_this_page;
414 : }
415 162685 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
416 162685 : folio_end_writeback(folio);
417 162685 : return;
418 :
419 : still_busy:
420 29919 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
421 : return;
422 : }
423 : EXPORT_SYMBOL(end_buffer_async_write);
424 :
425 : /*
426 : * If a page's buffers are under async readin (end_buffer_async_read
427 : * completion) then there is a possibility that another thread of
428 : * control could lock one of the buffers after it has completed
429 : * but while some of the other buffers have not completed. This
430 : * locked buffer would confuse end_buffer_async_read() into not unlocking
431 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
432 : * that this buffer is not under async I/O.
433 : *
434 : * The page comes unlocked when it has no locked buffer_async buffers
435 : * left.
436 : *
437 : * PageLocked prevents anyone from starting new async I/O reads against
438 : * any of the buffers.
439 : *
440 : * PageWriteback is used to prevent simultaneous writeout of the same
441 : * page.
442 : *
443 : * PageLocked prevents anyone from starting writeback of a page which is
444 : * under read I/O (PageWriteback is only ever set against a locked page).
445 : */
446 84190 : static void mark_buffer_async_read(struct buffer_head *bh)
447 : {
448 84190 : bh->b_end_io = end_buffer_async_read_io;
449 84190 : set_buffer_async_read(bh);
450 84190 : }
451 :
452 192604 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
453 : bh_end_io_t *handler)
454 : {
455 192604 : bh->b_end_io = handler;
456 192604 : set_buffer_async_write(bh);
457 192604 : }
458 :
459 0 : void mark_buffer_async_write(struct buffer_head *bh)
460 : {
461 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
462 0 : }
463 : EXPORT_SYMBOL(mark_buffer_async_write);
464 :
465 :
466 : /*
467 : * fs/buffer.c contains helper functions for buffer-backed address space's
468 : * fsync functions. A common requirement for buffer-based filesystems is
469 : * that certain data from the backing blockdev needs to be written out for
470 : * a successful fsync(). For example, ext2 indirect blocks need to be
471 : * written back and waited upon before fsync() returns.
472 : *
473 : * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
474 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
475 : * management of a list of dependent buffers at ->i_mapping->private_list.
476 : *
477 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
478 : * from their controlling inode's queue when they are being freed. But
479 : * try_to_free_buffers() will be operating against the *blockdev* mapping
480 : * at the time, not against the S_ISREG file which depends on those buffers.
481 : * So the locking for private_list is via the private_lock in the address_space
482 : * which backs the buffers. Which is different from the address_space
483 : * against which the buffers are listed. So for a particular address_space,
484 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
485 : * mapping->private_list will always be protected by the backing blockdev's
486 : * ->private_lock.
487 : *
488 : * Which introduces a requirement: all buffers on an address_space's
489 : * ->private_list must be from the same address_space: the blockdev's.
490 : *
491 : * address_spaces which do not place buffers at ->private_list via these
492 : * utility functions are free to use private_lock and private_list for
493 : * whatever they want. The only requirement is that list_empty(private_list)
494 : * be true at clear_inode() time.
495 : *
496 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
497 : * filesystems should do that. invalidate_inode_buffers() should just go
498 : * BUG_ON(!list_empty).
499 : *
500 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
501 : * take an address_space, not an inode. And it should be called
502 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
503 : * queued up.
504 : *
505 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
506 : * list if it is already on a list. Because if the buffer is on a list,
507 : * it *must* already be on the right one. If not, the filesystem is being
508 : * silly. This will save a ton of locking. But first we have to ensure
509 : * that buffers are taken *off* the old inode's list when they are freed
510 : * (presumably in truncate). That requires careful auditing of all
511 : * filesystems (do it inside bforget()). It could also be done by bringing
512 : * b_inode back.
513 : */
514 :
515 : /*
516 : * The buffer's backing address_space's private_lock must be held
517 : */
518 664 : static void __remove_assoc_queue(struct buffer_head *bh)
519 : {
520 664 : list_del_init(&bh->b_assoc_buffers);
521 664 : WARN_ON(!bh->b_assoc_map);
522 664 : bh->b_assoc_map = NULL;
523 664 : }
524 :
525 1125536997 : int inode_has_buffers(struct inode *inode)
526 : {
527 1125644126 : return !list_empty(&inode->i_data.private_list);
528 : }
529 :
530 : /*
531 : * osync is designed to support O_SYNC io. It waits synchronously for
532 : * all already-submitted IO to complete, but does not queue any new
533 : * writes to the disk.
534 : *
535 : * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
536 : * as you dirty the buffers, and then use osync_inode_buffers to wait for
537 : * completion. Any other dirty buffers which are not yet queued for
538 : * write will not be flushed to disk by the osync.
539 : */
540 103 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
541 : {
542 103 : struct buffer_head *bh;
543 103 : struct list_head *p;
544 103 : int err = 0;
545 :
546 103 : spin_lock(lock);
547 103 : repeat:
548 103 : list_for_each_prev(p, list) {
549 0 : bh = BH_ENTRY(p);
550 0 : if (buffer_locked(bh)) {
551 0 : get_bh(bh);
552 0 : spin_unlock(lock);
553 0 : wait_on_buffer(bh);
554 0 : if (!buffer_uptodate(bh))
555 0 : err = -EIO;
556 0 : brelse(bh);
557 0 : spin_lock(lock);
558 0 : goto repeat;
559 : }
560 : }
561 103 : spin_unlock(lock);
562 103 : return err;
563 : }
564 :
565 0 : void emergency_thaw_bdev(struct super_block *sb)
566 : {
567 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
568 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
569 0 : }
570 :
571 : /**
572 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
573 : * @mapping: the mapping which wants those buffers written
574 : *
575 : * Starts I/O against the buffers at mapping->private_list, and waits upon
576 : * that I/O.
577 : *
578 : * Basically, this is a convenience function for fsync().
579 : * @mapping is a file or directory which needs those buffers to be written for
580 : * a successful fsync().
581 : */
582 351 : int sync_mapping_buffers(struct address_space *mapping)
583 : {
584 351 : struct address_space *buffer_mapping = mapping->private_data;
585 :
586 351 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
587 : return 0;
588 :
589 103 : return fsync_buffers_list(&buffer_mapping->private_lock,
590 : &mapping->private_list);
591 : }
592 : EXPORT_SYMBOL(sync_mapping_buffers);
593 :
594 : /**
595 : * generic_buffers_fsync_noflush - generic buffer fsync implementation
596 : * for simple filesystems with no inode lock
597 : *
598 : * @file: file to synchronize
599 : * @start: start offset in bytes
600 : * @end: end offset in bytes (inclusive)
601 : * @datasync: only synchronize essential metadata if true
602 : *
603 : * This is a generic implementation of the fsync method for simple
604 : * filesystems which track all non-inode metadata in the buffers list
605 : * hanging off the address_space structure.
606 : */
607 351 : int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
608 : bool datasync)
609 : {
610 351 : struct inode *inode = file->f_mapping->host;
611 351 : int err;
612 351 : int ret;
613 :
614 351 : err = file_write_and_wait_range(file, start, end);
615 351 : if (err)
616 : return err;
617 :
618 351 : ret = sync_mapping_buffers(inode->i_mapping);
619 351 : if (!(inode->i_state & I_DIRTY_ALL))
620 250 : goto out;
621 101 : if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
622 2 : goto out;
623 :
624 99 : err = sync_inode_metadata(inode, 1);
625 99 : if (ret == 0)
626 99 : ret = err;
627 :
628 0 : out:
629 : /* check and advance again to catch errors after syncing out buffers */
630 351 : err = file_check_and_advance_wb_err(file);
631 351 : if (ret == 0)
632 351 : ret = err;
633 : return ret;
634 : }
635 : EXPORT_SYMBOL(generic_buffers_fsync_noflush);
636 :
637 : /**
638 : * generic_buffers_fsync - generic buffer fsync implementation
639 : * for simple filesystems with no inode lock
640 : *
641 : * @file: file to synchronize
642 : * @start: start offset in bytes
643 : * @end: end offset in bytes (inclusive)
644 : * @datasync: only synchronize essential metadata if true
645 : *
646 : * This is a generic implementation of the fsync method for simple
647 : * filesystems which track all non-inode metadata in the buffers list
648 : * hanging off the address_space structure. This also makes sure that
649 : * a device cache flush operation is called at the end.
650 : */
651 351 : int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
652 : bool datasync)
653 : {
654 351 : struct inode *inode = file->f_mapping->host;
655 351 : int ret;
656 :
657 351 : ret = generic_buffers_fsync_noflush(file, start, end, datasync);
658 351 : if (!ret)
659 351 : ret = blkdev_issue_flush(inode->i_sb->s_bdev);
660 351 : return ret;
661 : }
662 : EXPORT_SYMBOL(generic_buffers_fsync);
663 :
664 : /*
665 : * Called when we've recently written block `bblock', and it is known that
666 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
667 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
668 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
669 : */
670 11 : void write_boundary_block(struct block_device *bdev,
671 : sector_t bblock, unsigned blocksize)
672 : {
673 11 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
674 11 : if (bh) {
675 14 : if (buffer_dirty(bh))
676 2 : write_dirty_buffer(bh, 0);
677 7 : put_bh(bh);
678 : }
679 11 : }
680 :
681 33560 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
682 : {
683 33560 : struct address_space *mapping = inode->i_mapping;
684 33560 : struct address_space *buffer_mapping = bh->b_folio->mapping;
685 :
686 33560 : mark_buffer_dirty(bh);
687 33560 : if (!mapping->private_data) {
688 77 : mapping->private_data = buffer_mapping;
689 : } else {
690 33483 : BUG_ON(mapping->private_data != buffer_mapping);
691 : }
692 33560 : if (!bh->b_assoc_map) {
693 566 : spin_lock(&buffer_mapping->private_lock);
694 566 : list_move_tail(&bh->b_assoc_buffers,
695 : &mapping->private_list);
696 566 : bh->b_assoc_map = mapping;
697 566 : spin_unlock(&buffer_mapping->private_lock);
698 : }
699 33560 : }
700 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
701 :
702 : /*
703 : * Add a page to the dirty page list.
704 : *
705 : * It is a sad fact of life that this function is called from several places
706 : * deeply under spinlocking. It may not sleep.
707 : *
708 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
709 : * dirty-state coherency between the page and the buffers. It the page does
710 : * not have buffers then when they are later attached they will all be set
711 : * dirty.
712 : *
713 : * The buffers are dirtied before the page is dirtied. There's a small race
714 : * window in which a writepage caller may see the page cleanness but not the
715 : * buffer dirtiness. That's fine. If this code were to set the page dirty
716 : * before the buffers, a concurrent writepage caller could clear the page dirty
717 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
718 : * page on the dirty page list.
719 : *
720 : * We use private_lock to lock against try_to_free_buffers while using the
721 : * page's buffer list. Also use this to protect against clean buffers being
722 : * added to the page after it was set dirty.
723 : *
724 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
725 : * address_space though.
726 : */
727 78 : bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
728 : {
729 78 : struct buffer_head *head;
730 78 : bool newly_dirty;
731 :
732 78 : spin_lock(&mapping->private_lock);
733 78 : head = folio_buffers(folio);
734 78 : if (head) {
735 : struct buffer_head *bh = head;
736 :
737 960 : do {
738 960 : set_buffer_dirty(bh);
739 960 : bh = bh->b_this_page;
740 960 : } while (bh != head);
741 : }
742 : /*
743 : * Lock out page's memcg migration to keep PageDirty
744 : * synchronized with per-memcg dirty page counters.
745 : */
746 78 : folio_memcg_lock(folio);
747 78 : newly_dirty = !folio_test_set_dirty(folio);
748 78 : spin_unlock(&mapping->private_lock);
749 :
750 78 : if (newly_dirty)
751 25 : __folio_mark_dirty(folio, mapping, 1);
752 :
753 78 : folio_memcg_unlock(folio);
754 :
755 78 : if (newly_dirty)
756 25 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
757 :
758 78 : return newly_dirty;
759 : }
760 : EXPORT_SYMBOL(block_dirty_folio);
761 :
762 : /*
763 : * Write out and wait upon a list of buffers.
764 : *
765 : * We have conflicting pressures: we want to make sure that all
766 : * initially dirty buffers get waited on, but that any subsequently
767 : * dirtied buffers don't. After all, we don't want fsync to last
768 : * forever if somebody is actively writing to the file.
769 : *
770 : * Do this in two main stages: first we copy dirty buffers to a
771 : * temporary inode list, queueing the writes as we go. Then we clean
772 : * up, waiting for those writes to complete.
773 : *
774 : * During this second stage, any subsequent updates to the file may end
775 : * up refiling the buffer on the original inode's dirty list again, so
776 : * there is a chance we will end up with a buffer queued for write but
777 : * not yet completed on that list. So, as a final cleanup we go through
778 : * the osync code to catch these locked, dirty buffers without requeuing
779 : * any newly dirty buffers for write.
780 : */
781 103 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
782 : {
783 103 : struct buffer_head *bh;
784 103 : struct list_head tmp;
785 103 : struct address_space *mapping;
786 103 : int err = 0, err2;
787 103 : struct blk_plug plug;
788 :
789 103 : INIT_LIST_HEAD(&tmp);
790 103 : blk_start_plug(&plug);
791 :
792 103 : spin_lock(lock);
793 342 : while (!list_empty(list)) {
794 239 : bh = BH_ENTRY(list->next);
795 239 : mapping = bh->b_assoc_map;
796 239 : __remove_assoc_queue(bh);
797 : /* Avoid race with mark_buffer_dirty_inode() which does
798 : * a lockless check and we rely on seeing the dirty bit */
799 239 : smp_mb();
800 544 : if (buffer_dirty(bh) || buffer_locked(bh)) {
801 174 : list_add(&bh->b_assoc_buffers, &tmp);
802 174 : bh->b_assoc_map = mapping;
803 348 : if (buffer_dirty(bh)) {
804 173 : get_bh(bh);
805 173 : spin_unlock(lock);
806 : /*
807 : * Ensure any pending I/O completes so that
808 : * write_dirty_buffer() actually writes the
809 : * current contents - it is a noop if I/O is
810 : * still in flight on potentially older
811 : * contents.
812 : */
813 173 : write_dirty_buffer(bh, REQ_SYNC);
814 :
815 : /*
816 : * Kick off IO for the previous mapping. Note
817 : * that we will not run the very last mapping,
818 : * wait_on_buffer() will do that for us
819 : * through sync_buffer().
820 : */
821 173 : brelse(bh);
822 173 : spin_lock(lock);
823 : }
824 : }
825 : }
826 :
827 103 : spin_unlock(lock);
828 103 : blk_finish_plug(&plug);
829 103 : spin_lock(lock);
830 :
831 277 : while (!list_empty(&tmp)) {
832 174 : bh = BH_ENTRY(tmp.prev);
833 174 : get_bh(bh);
834 174 : mapping = bh->b_assoc_map;
835 174 : __remove_assoc_queue(bh);
836 : /* Avoid race with mark_buffer_dirty_inode() which does
837 : * a lockless check and we rely on seeing the dirty bit */
838 174 : smp_mb();
839 348 : if (buffer_dirty(bh)) {
840 0 : list_add(&bh->b_assoc_buffers,
841 : &mapping->private_list);
842 0 : bh->b_assoc_map = mapping;
843 : }
844 174 : spin_unlock(lock);
845 174 : wait_on_buffer(bh);
846 348 : if (!buffer_uptodate(bh))
847 0 : err = -EIO;
848 174 : brelse(bh);
849 174 : spin_lock(lock);
850 : }
851 :
852 103 : spin_unlock(lock);
853 103 : err2 = osync_buffers_list(lock, list);
854 103 : if (err)
855 : return err;
856 : else
857 103 : return err2;
858 : }
859 :
860 : /*
861 : * Invalidate any and all dirty buffers on a given inode. We are
862 : * probably unmounting the fs, but that doesn't mean we have already
863 : * done a sync(). Just drop the buffers from the inode list.
864 : *
865 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
866 : * assumes that all the buffers are against the blockdev. Not true
867 : * for reiserfs.
868 : */
869 487 : void invalidate_inode_buffers(struct inode *inode)
870 : {
871 487 : if (inode_has_buffers(inode)) {
872 65 : struct address_space *mapping = &inode->i_data;
873 65 : struct list_head *list = &mapping->private_list;
874 65 : struct address_space *buffer_mapping = mapping->private_data;
875 :
876 65 : spin_lock(&buffer_mapping->private_lock);
877 316 : while (!list_empty(list))
878 251 : __remove_assoc_queue(BH_ENTRY(list->next));
879 65 : spin_unlock(&buffer_mapping->private_lock);
880 : }
881 487 : }
882 : EXPORT_SYMBOL(invalidate_inode_buffers);
883 :
884 : /*
885 : * Remove any clean buffers from the inode's buffer list. This is called
886 : * when we're trying to free the inode itself. Those buffers can pin it.
887 : *
888 : * Returns true if all buffers were removed.
889 : */
890 106642 : int remove_inode_buffers(struct inode *inode)
891 : {
892 106642 : int ret = 1;
893 :
894 106642 : if (inode_has_buffers(inode)) {
895 0 : struct address_space *mapping = &inode->i_data;
896 0 : struct list_head *list = &mapping->private_list;
897 0 : struct address_space *buffer_mapping = mapping->private_data;
898 :
899 0 : spin_lock(&buffer_mapping->private_lock);
900 0 : while (!list_empty(list)) {
901 0 : struct buffer_head *bh = BH_ENTRY(list->next);
902 0 : if (buffer_dirty(bh)) {
903 : ret = 0;
904 : break;
905 : }
906 0 : __remove_assoc_queue(bh);
907 : }
908 0 : spin_unlock(&buffer_mapping->private_lock);
909 : }
910 106642 : return ret;
911 : }
912 :
913 : /*
914 : * Create the appropriate buffers when given a folio for data area and
915 : * the size of each buffer.. Use the bh->b_this_page linked list to
916 : * follow the buffers created. Return NULL if unable to create more
917 : * buffers.
918 : *
919 : * The retry flag is used to differentiate async IO (paging, swapping)
920 : * which may not fail from ordinary buffer allocations.
921 : */
922 177794 : struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
923 : bool retry)
924 : {
925 177794 : struct buffer_head *bh, *head;
926 177794 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
927 177794 : long offset;
928 177794 : struct mem_cgroup *memcg, *old_memcg;
929 :
930 177794 : if (retry)
931 177794 : gfp |= __GFP_NOFAIL;
932 :
933 : /* The folio lock pins the memcg */
934 177794 : memcg = folio_memcg(folio);
935 177794 : old_memcg = set_active_memcg(memcg);
936 :
937 177794 : head = NULL;
938 177794 : offset = folio_size(folio);
939 533482 : while ((offset -= size) >= 0) {
940 355688 : bh = alloc_buffer_head(gfp);
941 355688 : if (!bh)
942 0 : goto no_grow;
943 :
944 355688 : bh->b_this_page = head;
945 355688 : bh->b_blocknr = -1;
946 355688 : head = bh;
947 :
948 355688 : bh->b_size = size;
949 :
950 : /* Link the buffer to its folio */
951 355688 : folio_set_bh(bh, folio, offset);
952 : }
953 177794 : out:
954 177794 : set_active_memcg(old_memcg);
955 177794 : return head;
956 : /*
957 : * In case anything failed, we just free everything we got.
958 : */
959 : no_grow:
960 0 : if (head) {
961 0 : do {
962 0 : bh = head;
963 0 : head = head->b_this_page;
964 0 : free_buffer_head(bh);
965 0 : } while (head);
966 : }
967 :
968 0 : goto out;
969 : }
970 : EXPORT_SYMBOL_GPL(folio_alloc_buffers);
971 :
972 0 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
973 : bool retry)
974 : {
975 0 : return folio_alloc_buffers(page_folio(page), size, retry);
976 : }
977 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
978 :
979 231 : static inline void link_dev_buffers(struct folio *folio,
980 : struct buffer_head *head)
981 : {
982 231 : struct buffer_head *bh, *tail;
983 :
984 231 : bh = head;
985 13488 : do {
986 13488 : tail = bh;
987 13488 : bh = bh->b_this_page;
988 13488 : } while (bh);
989 231 : tail->b_this_page = head;
990 231 : folio_attach_private(folio, head);
991 231 : }
992 :
993 231 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
994 : {
995 231 : sector_t retval = ~((sector_t)0);
996 231 : loff_t sz = bdev_nr_bytes(bdev);
997 :
998 231 : if (sz) {
999 231 : unsigned int sizebits = blksize_bits(size);
1000 231 : retval = (sz >> sizebits);
1001 : }
1002 231 : return retval;
1003 : }
1004 :
1005 : /*
1006 : * Initialise the state of a blockdev folio's buffers.
1007 : */
1008 231 : static sector_t folio_init_buffers(struct folio *folio,
1009 : struct block_device *bdev, sector_t block, int size)
1010 : {
1011 231 : struct buffer_head *head = folio_buffers(folio);
1012 231 : struct buffer_head *bh = head;
1013 231 : bool uptodate = folio_test_uptodate(folio);
1014 231 : sector_t end_block = blkdev_max_block(bdev, size);
1015 :
1016 13488 : do {
1017 26976 : if (!buffer_mapped(bh)) {
1018 13488 : bh->b_end_io = NULL;
1019 13488 : bh->b_private = NULL;
1020 13488 : bh->b_bdev = bdev;
1021 13488 : bh->b_blocknr = block;
1022 13488 : if (uptodate)
1023 768 : set_buffer_uptodate(bh);
1024 13488 : if (block < end_block)
1025 13488 : set_buffer_mapped(bh);
1026 : }
1027 13488 : block++;
1028 13488 : bh = bh->b_this_page;
1029 13488 : } while (bh != head);
1030 :
1031 : /*
1032 : * Caller needs to validate requested block against end of device.
1033 : */
1034 231 : return end_block;
1035 : }
1036 :
1037 : /*
1038 : * Create the page-cache page that contains the requested block.
1039 : *
1040 : * This is used purely for blockdev mappings.
1041 : */
1042 : static int
1043 231 : grow_dev_page(struct block_device *bdev, sector_t block,
1044 : pgoff_t index, int size, int sizebits, gfp_t gfp)
1045 : {
1046 231 : struct inode *inode = bdev->bd_inode;
1047 231 : struct folio *folio;
1048 231 : struct buffer_head *bh;
1049 231 : sector_t end_block;
1050 231 : int ret = 0;
1051 231 : gfp_t gfp_mask;
1052 :
1053 231 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
1054 :
1055 : /*
1056 : * XXX: __getblk_slow() can not really deal with failure and
1057 : * will endlessly loop on improvised global reclaim. Prefer
1058 : * looping in the allocator rather than here, at least that
1059 : * code knows what it's doing.
1060 : */
1061 231 : gfp_mask |= __GFP_NOFAIL;
1062 :
1063 231 : folio = __filemap_get_folio(inode->i_mapping, index,
1064 : FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
1065 :
1066 231 : bh = folio_buffers(folio);
1067 231 : if (bh) {
1068 0 : if (bh->b_size == size) {
1069 0 : end_block = folio_init_buffers(folio, bdev,
1070 0 : (sector_t)index << sizebits, size);
1071 0 : goto done;
1072 : }
1073 0 : if (!try_to_free_buffers(folio))
1074 0 : goto failed;
1075 : }
1076 :
1077 231 : bh = folio_alloc_buffers(folio, size, true);
1078 :
1079 : /*
1080 : * Link the folio to the buffers and initialise them. Take the
1081 : * lock to be atomic wrt __find_get_block(), which does not
1082 : * run under the folio lock.
1083 : */
1084 231 : spin_lock(&inode->i_mapping->private_lock);
1085 231 : link_dev_buffers(folio, bh);
1086 462 : end_block = folio_init_buffers(folio, bdev,
1087 231 : (sector_t)index << sizebits, size);
1088 231 : spin_unlock(&inode->i_mapping->private_lock);
1089 231 : done:
1090 231 : ret = (block < end_block) ? 1 : -ENXIO;
1091 231 : failed:
1092 231 : folio_unlock(folio);
1093 231 : folio_put(folio);
1094 231 : return ret;
1095 : }
1096 :
1097 : /*
1098 : * Create buffers for the specified block device block's page. If
1099 : * that page was dirty, the buffers are set dirty also.
1100 : */
1101 : static int
1102 231 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1103 : {
1104 231 : pgoff_t index;
1105 231 : int sizebits;
1106 :
1107 231 : sizebits = PAGE_SHIFT - __ffs(size);
1108 231 : index = block >> sizebits;
1109 :
1110 : /*
1111 : * Check for a block which wants to lie outside our maximum possible
1112 : * pagecache index. (this comparison is done using sector_t types).
1113 : */
1114 231 : if (unlikely(index != block >> sizebits)) {
1115 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1116 : "device %pg\n",
1117 : __func__, (unsigned long long)block,
1118 : bdev);
1119 : return -EIO;
1120 : }
1121 :
1122 : /* Create a page with the proper size buffers.. */
1123 231 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1124 : }
1125 :
1126 : static struct buffer_head *
1127 231 : __getblk_slow(struct block_device *bdev, sector_t block,
1128 : unsigned size, gfp_t gfp)
1129 : {
1130 : /* Size must be multiple of hard sectorsize */
1131 462 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1132 : (size < 512 || size > PAGE_SIZE))) {
1133 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1134 : size);
1135 0 : printk(KERN_ERR "logical block size: %d\n",
1136 : bdev_logical_block_size(bdev));
1137 :
1138 0 : dump_stack();
1139 0 : return NULL;
1140 : }
1141 :
1142 462 : for (;;) {
1143 462 : struct buffer_head *bh;
1144 462 : int ret;
1145 :
1146 462 : bh = __find_get_block(bdev, block, size);
1147 462 : if (bh)
1148 231 : return bh;
1149 :
1150 231 : ret = grow_buffers(bdev, block, size, gfp);
1151 231 : if (ret < 0)
1152 : return NULL;
1153 : }
1154 : }
1155 :
1156 : /*
1157 : * The relationship between dirty buffers and dirty pages:
1158 : *
1159 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1160 : * the page is tagged dirty in the page cache.
1161 : *
1162 : * At all times, the dirtiness of the buffers represents the dirtiness of
1163 : * subsections of the page. If the page has buffers, the page dirty bit is
1164 : * merely a hint about the true dirty state.
1165 : *
1166 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1167 : * (if the page has buffers).
1168 : *
1169 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1170 : * buffers are not.
1171 : *
1172 : * Also, when blockdev buffers are explicitly read with bread(), they
1173 : * individually become uptodate. But their backing page remains not
1174 : * uptodate - even if all of its buffers are uptodate. A subsequent
1175 : * block_read_full_folio() against that folio will discover all the uptodate
1176 : * buffers, will set the folio uptodate and will perform no I/O.
1177 : */
1178 :
1179 : /**
1180 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1181 : * @bh: the buffer_head to mark dirty
1182 : *
1183 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1184 : * its backing page dirty, then tag the page as dirty in the page cache
1185 : * and then attach the address_space's inode to its superblock's dirty
1186 : * inode list.
1187 : *
1188 : * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
1189 : * i_pages lock and mapping->host->i_lock.
1190 : */
1191 5117962 : void mark_buffer_dirty(struct buffer_head *bh)
1192 : {
1193 10235924 : WARN_ON_ONCE(!buffer_uptodate(bh));
1194 :
1195 5117962 : trace_block_dirty_buffer(bh);
1196 :
1197 : /*
1198 : * Very *carefully* optimize the it-is-already-dirty case.
1199 : *
1200 : * Don't let the final "is it dirty" escape to before we
1201 : * perhaps modified the buffer.
1202 : */
1203 10235924 : if (buffer_dirty(bh)) {
1204 4894591 : smp_mb();
1205 9789182 : if (buffer_dirty(bh))
1206 : return;
1207 : }
1208 :
1209 446742 : if (!test_set_buffer_dirty(bh)) {
1210 223371 : struct folio *folio = bh->b_folio;
1211 223371 : struct address_space *mapping = NULL;
1212 :
1213 223371 : folio_memcg_lock(folio);
1214 446742 : if (!folio_test_set_dirty(folio)) {
1215 172627 : mapping = folio->mapping;
1216 172627 : if (mapping)
1217 172627 : __folio_mark_dirty(folio, mapping, 0);
1218 : }
1219 223371 : folio_memcg_unlock(folio);
1220 223371 : if (mapping)
1221 172627 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1222 : }
1223 : }
1224 : EXPORT_SYMBOL(mark_buffer_dirty);
1225 :
1226 22 : void mark_buffer_write_io_error(struct buffer_head *bh)
1227 : {
1228 22 : struct super_block *sb;
1229 :
1230 22 : set_buffer_write_io_error(bh);
1231 : /* FIXME: do we need to set this in both places? */
1232 22 : if (bh->b_folio && bh->b_folio->mapping)
1233 22 : mapping_set_error(bh->b_folio->mapping, -EIO);
1234 22 : if (bh->b_assoc_map)
1235 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1236 22 : rcu_read_lock();
1237 22 : sb = READ_ONCE(bh->b_bdev->bd_super);
1238 22 : if (sb)
1239 0 : errseq_set(&sb->s_wb_err, -EIO);
1240 22 : rcu_read_unlock();
1241 22 : }
1242 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1243 :
1244 : /*
1245 : * Decrement a buffer_head's reference count. If all buffers against a page
1246 : * have zero reference count, are clean and unlocked, and if the page is clean
1247 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1248 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1249 : * a page but it ends up not being freed, and buffers may later be reattached).
1250 : */
1251 158200 : void __brelse(struct buffer_head * buf)
1252 : {
1253 158200 : if (atomic_read(&buf->b_count)) {
1254 158200 : put_bh(buf);
1255 158200 : return;
1256 : }
1257 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1258 : }
1259 : EXPORT_SYMBOL(__brelse);
1260 :
1261 : /*
1262 : * bforget() is like brelse(), except it discards any
1263 : * potentially dirty data.
1264 : */
1265 87 : void __bforget(struct buffer_head *bh)
1266 : {
1267 87 : clear_buffer_dirty(bh);
1268 87 : if (bh->b_assoc_map) {
1269 76 : struct address_space *buffer_mapping = bh->b_folio->mapping;
1270 :
1271 76 : spin_lock(&buffer_mapping->private_lock);
1272 76 : list_del_init(&bh->b_assoc_buffers);
1273 76 : bh->b_assoc_map = NULL;
1274 76 : spin_unlock(&buffer_mapping->private_lock);
1275 : }
1276 87 : __brelse(bh);
1277 87 : }
1278 : EXPORT_SYMBOL(__bforget);
1279 :
1280 24 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1281 : {
1282 24 : lock_buffer(bh);
1283 48 : if (buffer_uptodate(bh)) {
1284 3 : unlock_buffer(bh);
1285 3 : return bh;
1286 : } else {
1287 21 : get_bh(bh);
1288 21 : bh->b_end_io = end_buffer_read_sync;
1289 21 : submit_bh(REQ_OP_READ, bh);
1290 21 : wait_on_buffer(bh);
1291 42 : if (buffer_uptodate(bh))
1292 : return bh;
1293 : }
1294 0 : brelse(bh);
1295 : return NULL;
1296 : }
1297 :
1298 : /*
1299 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1300 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1301 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1302 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1303 : * CPU's LRUs at the same time.
1304 : *
1305 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1306 : * sb_find_get_block().
1307 : *
1308 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1309 : * a local interrupt disable for that.
1310 : */
1311 :
/* Number of buffer_head slots in each CPU's LRU array. */
#define BH_LRU_SIZE	16

/* One CPU's LRU: a fixed-size array of buffer_head pointers. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-CPU LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to the current CPU's LRU:
 * local interrupts are disabled on CONFIG_SMP builds, preemption is
 * disabled otherwise.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1327 :
/*
 * Assert that interrupts are enabled.  On configurations where
 * irqs_disabled is not available as a macro this is a no-op.
 */
#ifdef irqs_disabled
static inline void check_irqs_on(void)
{
	BUG_ON(irqs_disabled());
}
#else
static inline void check_irqs_on(void)
{
}
#endif
1334 :
1335 : /*
1336 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1337 : * inserted at the front, and the buffer_head at the back if any is evicted.
1338 : * Or, if already in the LRU it is moved to the front.
1339 : */
1340 2065 : static void bh_lru_install(struct buffer_head *bh)
1341 : {
1342 2065 : struct buffer_head *evictee = bh;
1343 2065 : struct bh_lru *b;
1344 2065 : int i;
1345 :
1346 2065 : check_irqs_on();
1347 2065 : bh_lru_lock();
1348 :
1349 : /*
1350 : * the refcount of buffer_head in bh_lru prevents dropping the
1351 : * attached page(i.e., try_to_free_buffers) so it could cause
1352 : * failing page migration.
1353 : * Skip putting upcoming bh into bh_lru until migration is done.
1354 : */
1355 2065 : if (lru_cache_disabled()) {
1356 0 : bh_lru_unlock();
1357 0 : return;
1358 : }
1359 :
1360 2065 : b = this_cpu_ptr(&bh_lrus);
1361 35105 : for (i = 0; i < BH_LRU_SIZE; i++) {
1362 33040 : swap(evictee, b->bhs[i]);
1363 33040 : if (evictee == bh) {
1364 0 : bh_lru_unlock();
1365 0 : return;
1366 : }
1367 : }
1368 :
1369 2065 : get_bh(bh);
1370 2065 : bh_lru_unlock();
1371 2065 : brelse(evictee);
1372 : }
1373 :
1374 : /*
1375 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1376 : */
1377 : static struct buffer_head *
1378 156248 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1379 : {
1380 156248 : struct buffer_head *ret = NULL;
1381 156248 : unsigned int i;
1382 :
1383 156248 : check_irqs_on();
1384 156248 : bh_lru_lock();
1385 583363 : for (i = 0; i < BH_LRU_SIZE; i++) {
1386 424584 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1387 :
1388 424587 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1389 153720 : bh->b_size == size) {
1390 153720 : if (i) {
1391 362650 : while (i) {
1392 230378 : __this_cpu_write(bh_lrus.bhs[i],
1393 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1394 230378 : i--;
1395 : }
1396 132272 : __this_cpu_write(bh_lrus.bhs[0], bh);
1397 : }
1398 153720 : get_bh(bh);
1399 153720 : ret = bh;
1400 153720 : break;
1401 : }
1402 : }
1403 156251 : bh_lru_unlock();
1404 156251 : return ret;
1405 : }
1406 :
1407 : /*
1408 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1409 : * it in the LRU and mark it as accessed. If it is not present then return
1410 : * NULL
1411 : */
1412 : struct buffer_head *
1413 156248 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1414 : {
1415 156248 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1416 :
1417 156251 : if (bh == NULL) {
1418 : /* __find_get_block_slow will mark the page accessed */
1419 2531 : bh = __find_get_block_slow(bdev, block);
1420 2531 : if (bh)
1421 2065 : bh_lru_install(bh);
1422 : } else
1423 153720 : touch_buffer(bh);
1424 :
1425 156250 : return bh;
1426 : }
1427 : EXPORT_SYMBOL(__find_get_block);
1428 :
1429 : /*
1430 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1431 : * which corresponds to the passed block_device, block and size. The
1432 : * returned buffer has its reference count incremented.
1433 : *
1434 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1435 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1436 : */
1437 : struct buffer_head *
1438 155776 : __getblk_gfp(struct block_device *bdev, sector_t block,
1439 : unsigned size, gfp_t gfp)
1440 : {
1441 155776 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1442 :
1443 155776 : might_sleep();
1444 155776 : if (bh == NULL)
1445 231 : bh = __getblk_slow(bdev, block, size, gfp);
1446 155776 : return bh;
1447 : }
1448 : EXPORT_SYMBOL(__getblk_gfp);
1449 :
1450 : /*
1451 : * Do async read-ahead on a buffer..
1452 : */
1453 200 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1454 : {
1455 200 : struct buffer_head *bh = __getblk(bdev, block, size);
1456 200 : if (likely(bh)) {
1457 200 : bh_readahead(bh, REQ_RAHEAD);
1458 200 : brelse(bh);
1459 : }
1460 200 : }
1461 : EXPORT_SYMBOL(__breadahead);
1462 :
1463 : /**
1464 : * __bread_gfp() - reads a specified block and returns the bh
1465 : * @bdev: the block_device to read from
1466 : * @block: number of block
1467 : * @size: size (in bytes) to read
1468 : * @gfp: page allocation flag
1469 : *
1470 : * Reads a specified block, and returns buffer head that contains it.
1471 : * The page cache can be allocated from non-movable area
1472 : * not to prevent page migration if you set gfp to zero.
1473 : * It returns NULL if the block was unreadable.
1474 : */
1475 : struct buffer_head *
1476 121554 : __bread_gfp(struct block_device *bdev, sector_t block,
1477 : unsigned size, gfp_t gfp)
1478 : {
1479 121554 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1480 :
1481 243104 : if (likely(bh) && !buffer_uptodate(bh))
1482 24 : bh = __bread_slow(bh);
1483 121552 : return bh;
1484 : }
1485 : EXPORT_SYMBOL(__bread_gfp);
1486 :
1487 28669 : static void __invalidate_bh_lrus(struct bh_lru *b)
1488 : {
1489 28669 : int i;
1490 :
1491 487054 : for (i = 0; i < BH_LRU_SIZE; i++) {
1492 458382 : brelse(b->bhs[i]);
1493 458355 : b->bhs[i] = NULL;
1494 : }
1495 28672 : }
1496 : /*
1497 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1498 : * This doesn't race because it runs in each cpu either in irq
1499 : * or with preempt disabled.
1500 : */
1501 12 : static void invalidate_bh_lru(void *arg)
1502 : {
1503 12 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1504 :
1505 12 : __invalidate_bh_lrus(b);
1506 12 : put_cpu_var(bh_lrus);
1507 12 : }
1508 :
1509 187147 : bool has_bh_in_lru(int cpu, void *dummy)
1510 : {
1511 187147 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1512 187147 : int i;
1513 :
1514 3181307 : for (i = 0; i < BH_LRU_SIZE; i++) {
1515 2994172 : if (b->bhs[i])
1516 : return true;
1517 : }
1518 :
1519 : return false;
1520 : }
1521 :
1522 84318 : void invalidate_bh_lrus(void)
1523 : {
1524 84318 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1525 84318 : }
1526 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1527 :
1528 : /*
1529 : * It's called from workqueue context so we need a bh_lru_lock to close
1530 : * the race with preemption/irq.
1531 : */
1532 28663 : void invalidate_bh_lrus_cpu(void)
1533 : {
1534 28663 : struct bh_lru *b;
1535 :
1536 28663 : bh_lru_lock();
1537 28663 : b = this_cpu_ptr(&bh_lrus);
1538 28663 : __invalidate_bh_lrus(b);
1539 28651 : bh_lru_unlock();
1540 28662 : }
1541 :
1542 3487 : void set_bh_page(struct buffer_head *bh,
1543 : struct page *page, unsigned long offset)
1544 : {
1545 3487 : bh->b_page = page;
1546 3487 : BUG_ON(offset >= PAGE_SIZE);
1547 3487 : if (PageHighMem(page))
1548 : /*
1549 : * This catches illegal uses and preserves the offset:
1550 : */
1551 : bh->b_data = (char *)(0 + offset);
1552 : else
1553 3487 : bh->b_data = page_address(page) + offset;
1554 3487 : }
1555 : EXPORT_SYMBOL(set_bh_page);
1556 :
1557 355688 : void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1558 : unsigned long offset)
1559 : {
1560 355688 : bh->b_folio = folio;
1561 355688 : BUG_ON(offset >= folio_size(folio));
1562 355688 : if (folio_test_highmem(folio))
1563 : /*
1564 : * This catches illegal uses and preserves the offset:
1565 : */
1566 : bh->b_data = (char *)(0 + offset);
1567 : else
1568 355688 : bh->b_data = folio_address(folio) + offset;
1569 355688 : }
1570 : EXPORT_SYMBOL(folio_set_bh);
1571 :
1572 : /*
1573 : * Called when truncating a buffer on a page completely.
1574 : */
1575 :
1576 : /* Bits that are cleared during an invalidate */
1577 : #define BUFFER_FLAGS_DISCARD \
1578 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1579 : 1 << BH_Delay | 1 << BH_Unwritten)
1580 :
1581 191928 : static void discard_buffer(struct buffer_head * bh)
1582 : {
1583 191928 : unsigned long b_state;
1584 :
1585 191928 : lock_buffer(bh);
1586 191928 : clear_buffer_dirty(bh);
1587 191928 : bh->b_bdev = NULL;
1588 191928 : b_state = READ_ONCE(bh->b_state);
1589 191928 : do {
1590 191928 : } while (!try_cmpxchg(&bh->b_state, &b_state,
1591 : b_state & ~BUFFER_FLAGS_DISCARD));
1592 191928 : unlock_buffer(bh);
1593 191928 : }
1594 :
1595 : /**
1596 : * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1597 : * @folio: The folio which is affected.
1598 : * @offset: start of the range to invalidate
1599 : * @length: length of the range to invalidate
1600 : *
1601 : * block_invalidate_folio() is called when all or part of the folio has been
1602 : * invalidated by a truncate operation.
1603 : *
1604 : * block_invalidate_folio() does not have to release all buffers, but it must
1605 : * ensure that no dirty buffer is left outside @offset and that no I/O
1606 : * is underway against any of the blocks which are outside the truncation
1607 : * point. Because the caller is about to free (and possibly reuse) those
1608 : * blocks on-disk.
1609 : */
1610 137795 : void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1611 : {
1612 137795 : struct buffer_head *head, *bh, *next;
1613 137795 : size_t curr_off = 0;
1614 137795 : size_t stop = length + offset;
1615 :
1616 137795 : BUG_ON(!folio_test_locked(folio));
1617 :
1618 : /*
1619 : * Check for overflow
1620 : */
1621 137795 : BUG_ON(stop > folio_size(folio) || stop < length);
1622 :
1623 137795 : head = folio_buffers(folio);
1624 137795 : if (!head)
1625 : return;
1626 :
1627 : bh = head;
1628 198915 : do {
1629 198915 : size_t next_off = curr_off + bh->b_size;
1630 198915 : next = bh->b_this_page;
1631 :
1632 : /*
1633 : * Are we still fully in range ?
1634 : */
1635 198915 : if (next_off > stop)
1636 4263 : goto out;
1637 :
1638 : /*
1639 : * is this block fully invalidated?
1640 : */
1641 194652 : if (offset <= curr_off)
1642 191928 : discard_buffer(bh);
1643 194652 : curr_off = next_off;
1644 194652 : bh = next;
1645 194652 : } while (bh != head);
1646 :
1647 : /*
1648 : * We release buffers only if the entire folio is being invalidated.
1649 : * The get_block cached value has been unconditionally invalidated,
1650 : * so real IO is not possible anymore.
1651 : */
1652 133532 : if (length == folio_size(folio))
1653 132175 : filemap_release_folio(folio, 0);
1654 1357 : out:
1655 : return;
1656 : }
1657 : EXPORT_SYMBOL(block_invalidate_folio);
1658 :
1659 : /*
1660 : * We attach and possibly dirty the buffers atomically wrt
1661 : * block_dirty_folio() via private_lock. try_to_free_buffers
1662 : * is already excluded via the folio lock.
1663 : */
1664 177563 : void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
1665 : unsigned long b_state)
1666 : {
1667 177563 : struct buffer_head *bh, *head, *tail;
1668 :
1669 177563 : head = folio_alloc_buffers(folio, blocksize, true);
1670 177563 : bh = head;
1671 342200 : do {
1672 342200 : bh->b_state |= b_state;
1673 342200 : tail = bh;
1674 342200 : bh = bh->b_this_page;
1675 342200 : } while (bh);
1676 177563 : tail->b_this_page = head;
1677 :
1678 177563 : spin_lock(&folio->mapping->private_lock);
1679 351939 : if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
1680 : bh = head;
1681 7285 : do {
1682 7285 : if (folio_test_dirty(folio))
1683 0 : set_buffer_dirty(bh);
1684 7285 : if (folio_test_uptodate(folio))
1685 7285 : set_buffer_uptodate(bh);
1686 7285 : bh = bh->b_this_page;
1687 7285 : } while (bh != head);
1688 : }
1689 177563 : folio_attach_private(folio, head);
1690 177563 : spin_unlock(&folio->mapping->private_lock);
1691 177563 : }
1692 : EXPORT_SYMBOL(folio_create_empty_buffers);
1693 :
/*
 * Page-based wrapper: look up the folio containing @page and hand the
 * request to folio_create_empty_buffers() unchanged.
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	struct folio *folio = page_folio(page);

	folio_create_empty_buffers(folio, blocksize, b_state);
}
EXPORT_SYMBOL(create_empty_buffers);
1700 :
1701 : /**
1702 : * clean_bdev_aliases: clean a range of buffers in block device
1703 : * @bdev: Block device to clean buffers in
1704 : * @block: Start of a range of blocks to clean
1705 : * @len: Number of blocks to clean
1706 : *
1707 : * We are taking a range of blocks for data and we don't want writeback of any
1708 : * buffer-cache aliases starting from return from this function and until the
1709 : * moment when something will explicitly mark the buffer dirty (hopefully that
1710 : * will not happen until we will free that block ;-) We don't even need to mark
1711 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1712 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1713 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1714 : * would confuse anyone who might pick it with bread() afterwards...
1715 : *
1716 : * Also.. Note that bforget() doesn't lock the buffer. So there can be
1717 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1718 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1719 : * need to. That happens here.
1720 : */
1721 33242 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1722 : {
1723 33242 : struct inode *bd_inode = bdev->bd_inode;
1724 33242 : struct address_space *bd_mapping = bd_inode->i_mapping;
1725 33242 : struct folio_batch fbatch;
1726 33242 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1727 33242 : pgoff_t end;
1728 33242 : int i, count;
1729 33242 : struct buffer_head *bh;
1730 33242 : struct buffer_head *head;
1731 :
1732 33242 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1733 33242 : folio_batch_init(&fbatch);
1734 33242 : while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1735 8749 : count = folio_batch_count(&fbatch);
1736 17498 : for (i = 0; i < count; i++) {
1737 8749 : struct folio *folio = fbatch.folios[i];
1738 :
1739 8749 : if (!folio_buffers(folio))
1740 531 : continue;
1741 : /*
1742 : * We use folio lock instead of bd_mapping->private_lock
1743 : * to pin buffers here since we can afford to sleep and
1744 : * it scales better than a global spinlock lock.
1745 : */
1746 8218 : folio_lock(folio);
1747 : /* Recheck when the folio is locked which pins bhs */
1748 8218 : head = folio_buffers(folio);
1749 8218 : if (!head)
1750 0 : goto unlock_page;
1751 : bh = head;
1752 324940 : do {
1753 649880 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1754 308711 : goto next;
1755 16229 : if (bh->b_blocknr >= block + len)
1756 : break;
1757 8218 : clear_buffer_dirty(bh);
1758 8218 : wait_on_buffer(bh);
1759 8218 : clear_buffer_req(bh);
1760 316929 : next:
1761 316929 : bh = bh->b_this_page;
1762 316929 : } while (bh != head);
1763 8218 : unlock_page:
1764 8218 : folio_unlock(folio);
1765 : }
1766 8749 : folio_batch_release(&fbatch);
1767 8749 : cond_resched();
1768 : /* End of range already reached? */
1769 8749 : if (index > end || !index)
1770 : break;
1771 : }
1772 33242 : }
1773 : EXPORT_SYMBOL(clean_bdev_aliases);
1774 :
/*
 * Base-2 logarithm of a block size.
 *
 * Size is a power-of-two in the range 512..PAGE_SIZE,
 * and the case we care about most is PAGE_SIZE.
 *
 * So this *could* possibly be written with those
 * constraints in mind (relevant mostly if some
 * architecture has a slow bit-scan instruction)
 */
static inline int block_size_bits(unsigned int blocksize)
{
	const int bits = ilog2(blocksize);

	return bits;
}
1787 :
1788 5138904 : static struct buffer_head *folio_create_buffers(struct folio *folio,
1789 : struct inode *inode,
1790 : unsigned int b_state)
1791 : {
1792 5138904 : BUG_ON(!folio_test_locked(folio));
1793 :
1794 5138904 : if (!folio_buffers(folio))
1795 355048 : folio_create_empty_buffers(folio,
1796 177524 : 1 << READ_ONCE(inode->i_blkbits),
1797 : b_state);
1798 5138904 : return folio_buffers(folio);
1799 : }
1800 :
1801 : /*
1802 : * NOTE! All mapped/uptodate combinations are valid:
1803 : *
1804 : * Mapped Uptodate Meaning
1805 : *
1806 : * No No "unknown" - must do get_block()
1807 : * No Yes "hole" - zero-filled
1808 : * Yes No "allocated" - allocated on disk, not read in
1809 : * Yes Yes "valid" - allocated and up-to-date in memory.
1810 : *
1811 : * "Dirty" is valid only with the last case (mapped+uptodate).
1812 : */
1813 :
1814 : /*
1815 : * While block_write_full_page is writing back the dirty buffers under
1816 : * the page lock, whoever dirtied the buffers may decide to clean them
1817 : * again at any time. We handle that by only looking at the buffer
1818 : * state inside lock_buffer().
1819 : *
1820 : * If block_write_full_page() is called for regular writeback
1821 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1822 : * locked buffer. This only can happen if someone has written the buffer
1823 : * directly, with submit_bh(). At the address_space level PageWriteback
1824 : * prevents this contention from occurring.
1825 : *
1826 : * If block_write_full_page() is called with wbc->sync_mode ==
1827 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1828 : * causes the writes to be flagged as synchronous writes.
 *
 * @inode:     inode owning the folio; supplies i_size and is handed to
 *             get_block().
 * @folio:     folio to write back. It is unlocked here on both the normal
 *             and the error path (folio_unlock() below).
 * @get_block: mapping callback, invoked with create == 1 for every dirty
 *             buffer that is unmapped or delayed and lies inside i_size.
 * @wbc:       writeback control; sync_mode selects lock_buffer() versus
 *             trylock_buffer(), and it provides the write flags.
 * @handler:   end_io handler installed on each buffer that gets submitted
 *             (via mark_buffer_async_write_endio()).
 *
 * Returns 0, or the error returned by get_block(). In the error case the
 * error is also recorded with folio_set_error() and mapping_set_error(),
 * and already-mapped dirty buffers are still submitted.
1829 : */
1830 162862 : int __block_write_full_folio(struct inode *inode, struct folio *folio,
1831 : get_block_t *get_block, struct writeback_control *wbc,
1832 : bh_end_io_t *handler)
1833 : {
1834 162862 : int err;
1835 162862 : sector_t block;
1836 162862 : sector_t last_block;
1837 162862 : struct buffer_head *bh, *head;
1838 162862 : unsigned int blocksize, bbits;
1839 162862 : int nr_underway = 0;
1840 162862 : blk_opf_t write_flags = wbc_to_write_flags(wbc);
1841 :
/* Buffers created here start out dirty and uptodate. */
1842 162862 : head = folio_create_buffers(folio, inode,
1843 : (1 << BH_Dirty) | (1 << BH_Uptodate));
1844 :
1845 : /*
1846 : * Be very careful. We have no exclusion from block_dirty_folio
1847 : * here, and the (potentially unmapped) buffers may become dirty at
1848 : * any time. If a buffer becomes dirty here after we've inspected it
1849 : * then we just miss that fact, and the folio stays dirty.
1850 : *
1851 : * Buffers outside i_size may be dirtied by block_dirty_folio;
1852 : * handle that here by just cleaning them.
1853 : */
1854 :
1855 162862 : bh = head;
1856 162862 : blocksize = bh->b_size;
1857 162862 : bbits = block_size_bits(blocksize);
1858 :
/* First file block covered by this folio, and the last block inside i_size. */
1859 162862 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1860 162862 : last_block = (i_size_read(inode) - 1) >> bbits;
1861 :
1862 : /*
1863 : * Get all the dirty buffers mapped to disk addresses and
1864 : * handle any aliases from the underlying blockdev's mapping.
1865 : */
1866 250155 : do {
1867 250155 : if (block > last_block) {
1868 : /*
1869 : * mapped buffers outside i_size will occur, because
1870 : * this folio can be outside i_size when there is a
1871 : * truncate in progress.
1872 : */
1873 : /*
1874 : * The buffer was zeroed by block_write_full_page()
1875 : */
1876 1220 : clear_buffer_dirty(bh);
1877 1220 : set_buffer_uptodate(bh);
1878 746805 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1879 : buffer_dirty(bh)) {
1880 135 : WARN_ON(bh->b_size != blocksize);
1881 135 : err = get_block(inode, block, bh, 1);
1882 135 : if (err)
1883 2 : goto recover;
1884 133 : clear_buffer_delay(bh);
1885 266 : if (buffer_new(bh)) {
1886 : /* blockdev mappings never come here */
1887 117 : clear_buffer_new(bh);
1888 117 : clean_bdev_bh_alias(bh);
1889 : }
1890 : }
1891 250153 : bh = bh->b_this_page;
1892 250153 : block++;
1893 250153 : } while (bh != head);
1894 :
/*
 * The loop above exits with bh == head, so this pass starts at the first
 * buffer again. Each "continue" jumps to the loop condition, which is
 * where bh is advanced.
 */
1895 250274 : do {
1896 500548 : if (!buffer_mapped(bh))
1897 23400 : continue;
1898 : /*
1899 : * If it's a fully non-blocking write attempt and we cannot
1900 : * lock the buffer then redirty the folio. Note that this can
1901 : * potentially cause a busy-wait loop from writeback threads
1902 : * and kswapd activity, but those code paths have their own
1903 : * higher-level throttling.
1904 : */
1905 226874 : if (wbc->sync_mode != WB_SYNC_NONE) {
1906 188820 : lock_buffer(bh);
1907 38054 : } else if (!trylock_buffer(bh)) {
1908 0 : folio_redirty_for_writepage(wbc, folio);
1909 0 : continue;
1910 : }
/* Dirty state is sampled only now, with the buffer locked. */
1911 453779 : if (test_clear_buffer_dirty(bh)) {
1912 192604 : mark_buffer_async_write_endio(bh, handler);
1913 : } else {
1914 34288 : unlock_buffer(bh);
1915 : }
1916 250274 : } while ((bh = bh->b_this_page) != head);
1917 :
1918 : /*
1919 : * The folio and its buffers are protected by the writeback flag,
1920 : * so we can drop the bh refcounts early.
1921 : */
1922 162860 : BUG_ON(folio_test_writeback(folio));
1923 162860 : folio_start_writeback(folio);
1924 :
/*
 * Submit every buffer marked async-write above. The successor is read
 * before submitting; presumably bh must not be touched once its I/O has
 * been handed off -- TODO confirm.
 */
1925 250278 : do {
1926 250278 : struct buffer_head *next = bh->b_this_page;
1927 500556 : if (buffer_async_write(bh)) {
1928 192604 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1929 192604 : nr_underway++;
1930 : }
1931 250278 : bh = next;
1932 250278 : } while (bh != head);
1933 162860 : folio_unlock(folio);
1934 :
1935 162860 : err = 0;
/* The recover path also lands here, with err still holding the get_block() error. */
1936 162861 : done:
1937 162861 : if (nr_underway == 0) {
1938 : /*
1939 : * The folio was marked dirty, but the buffers were
1940 : * clean. Someone wrote them back by hand with
1941 : * write_dirty_buffer/submit_bh. A rare case.
1942 : */
1943 177 : folio_end_writeback(folio);
1944 :
1945 : /*
1946 : * The folio and buffer_heads can be released at any time from
1947 : * here on.
1948 : */
1949 : }
1950 162861 : return err;
1951 :
1952 : recover:
1953 : /*
1954 : * ENOSPC, or some other error. We may already have added some
1955 : * blocks to the file, so we need to write these out to avoid
1956 : * exposing stale data.
1957 : * The folio is currently locked and not marked for writeback
1958 : */
1959 2 : bh = head;
1960 : /* Recovery: lock and submit the mapped buffers */
1961 128 : do {
1962 256 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1963 : !buffer_delay(bh)) {
1964 0 : lock_buffer(bh);
1965 0 : mark_buffer_async_write_endio(bh, handler);
1966 : } else {
1967 : /*
1968 : * The buffer may have been set dirty during
1969 : * attachment to a dirty folio.
1970 : */
1971 128 : clear_buffer_dirty(bh);
1972 : }
1973 128 : } while ((bh = bh->b_this_page) != head);
/* Record the failure on both the folio and the mapping before starting writeback. */
1974 2 : folio_set_error(folio);
1975 2 : BUG_ON(folio_test_writeback(folio));
1976 2 : mapping_set_error(folio->mapping, err);
1977 2 : folio_start_writeback(folio);
1978 128 : do {
1979 128 : struct buffer_head *next = bh->b_this_page;
1980 256 : if (buffer_async_write(bh)) {
1981 0 : clear_buffer_dirty(bh);
1982 0 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1983 0 : nr_underway++;
1984 : }
1985 128 : bh = next;
1986 128 : } while (bh != head);
1987 2 : folio_unlock(folio);
1988 2 : goto done;
1989 : }
1990 : EXPORT_SYMBOL(__block_write_full_folio);
1991 :
1992 : /*
1993 : * If a folio has any new buffers, zero them out here, and mark them uptodate
1994 : * and dirty so they'll be written out (in order to prevent uninitialised
1995 : * block data from leaking). And clear the new bit.
1996 : */
1997 17 : void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1998 : {
1999 17 : size_t block_start, block_end;
2000 17 : struct buffer_head *head, *bh;
2001 :
2002 17 : BUG_ON(!folio_test_locked(folio));
2003 17 : head = folio_buffers(folio);
2004 17 : if (!head)
2005 : return;
2006 :
2007 : bh = head;
2008 : block_start = 0;
2009 1088 : do {
2010 1088 : block_end = block_start + bh->b_size;
2011 :
2012 2176 : if (buffer_new(bh)) {
2013 0 : if (block_end > from && block_start < to) {
2014 0 : if (!folio_test_uptodate(folio)) {
2015 0 : size_t start, xend;
2016 :
2017 0 : start = max(from, block_start);
2018 0 : xend = min(to, block_end);
2019 :
2020 0 : folio_zero_segment(folio, start, xend);
2021 0 : set_buffer_uptodate(bh);
2022 : }
2023 :
2024 0 : clear_buffer_new(bh);
2025 0 : mark_buffer_dirty(bh);
2026 : }
2027 : }
2028 :
2029 1088 : block_start = block_end;
2030 1088 : bh = bh->b_this_page;
2031 1088 : } while (bh != head);
2032 : }
2033 : EXPORT_SYMBOL(folio_zero_new_buffers);
2034 :
/*
 * iomap_to_bh - translate an iomap extent into buffer_head state.
 * @inode: inode the block belongs to (block size and i_size are read from it)
 * @block: file block number being mapped
 * @bh:    buffer head to update
 * @iomap: extent that must cover @block (BUG_ON otherwise)
 *
 * bh->b_bdev is always taken from the iomap. The remaining state depends on
 * iomap->type; there is no default case, so any type not listed below
 * leaves the other buffer state unchanged.
 */
2035 : static void
2036 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
2037 : const struct iomap *iomap)
2038 : {
/* Byte offset of the block within the file. */
2039 0 : loff_t offset = block << inode->i_blkbits;
2040 :
2041 0 : bh->b_bdev = iomap->bdev;
2042 :
2043 : /*
2044 : * Block points to offset in file we need to map, iomap contains
2045 : * the offset at which the map starts. If the map ends before the
2046 : * current block, then do not map the buffer and let the caller
2047 : * handle it.
2048 : */
2049 0 : BUG_ON(offset >= iomap->offset + iomap->length);
2050 :
2051 0 : switch (iomap->type) {
2052 : case IOMAP_HOLE:
2053 : /*
2054 : * If the buffer is not up to date or beyond the current EOF,
2055 : * we need to mark it as new to ensure sub-block zeroing is
2056 : * executed if necessary.
2057 : */
2058 0 : if (!buffer_uptodate(bh) ||
2059 : (offset >= i_size_read(inode)))
2060 0 : set_buffer_new(bh);
2061 : break;
2062 : case IOMAP_DELALLOC:
/* Same "new" rule as for a hole, but the buffer also becomes uptodate, mapped and delayed. */
2063 0 : if (!buffer_uptodate(bh) ||
2064 : (offset >= i_size_read(inode)))
2065 0 : set_buffer_new(bh);
2066 0 : set_buffer_uptodate(bh);
2067 0 : set_buffer_mapped(bh);
2068 0 : set_buffer_delay(bh);
2069 : break;
2070 : case IOMAP_UNWRITTEN:
2071 : /*
2072 : * For unwritten regions, we always need to ensure that regions
2073 : * in the block we are not writing to are zeroed. Mark the
2074 : * buffer as new to ensure this.
2075 : */
2076 0 : set_buffer_new(bh);
2077 0 : set_buffer_unwritten(bh);
2078 0 : fallthrough;
2079 0 : case IOMAP_MAPPED:
2080 0 : if ((iomap->flags & IOMAP_F_NEW) ||
2081 : offset >= i_size_read(inode))
2082 0 : set_buffer_new(bh);
/* Disk block = (extent start address + offset into the extent) in block units. */
2083 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2084 0 : inode->i_blkbits;
2085 0 : set_buffer_mapped(bh);
2086 : break;
2087 : }
2088 0 : }
2089 :
/*
 * __block_write_begin_int - prepare the buffers of a locked folio for a write.
 * @folio:     locked folio (BUG_ON otherwise)
 * @pos:       file position of the write; only its offset within a page is used
 * @len:       number of bytes to be written; from + len must not exceed
 *             PAGE_SIZE (BUG_ON otherwise)
 * @get_block: mapping callback, called with create == 1; when NULL the
 *             mapping is taken from @iomap via iomap_to_bh()
 * @iomap:     extent used when @get_block is NULL
 *
 * For each buffer overlapping [from, to): map it if unmapped, zero the parts
 * of a newly allocated buffer that the write will not cover, and start a read
 * for a buffer that is only partially overwritten and not uptodate.
 *
 * Returns 0, the error from get_block(), or -EIO if a buffer read here did
 * not end up uptodate. On error folio_zero_new_buffers() is run over the
 * write range.
 */
2090 4969741 : int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
2091 : get_block_t *get_block, const struct iomap *iomap)
2092 : {
2093 4969741 : unsigned from = pos & (PAGE_SIZE - 1);
2094 4969741 : unsigned to = from + len;
2095 4969741 : struct inode *inode = folio->mapping->host;
2096 4969741 : unsigned block_start, block_end;
2097 4969741 : sector_t block;
2098 4969741 : int err = 0;
2099 4969741 : unsigned blocksize, bbits;
/*
 * wait[] has two slots: reads are only issued for buffers that overlap the
 * write range but stick out of it (block_start < from or block_end > to),
 * i.e. at most the first and the last buffer of the range.
 */
2100 4969741 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2101 :
2102 4969741 : BUG_ON(!folio_test_locked(folio));
2103 4969741 : BUG_ON(from > PAGE_SIZE);
2104 4969741 : BUG_ON(to > PAGE_SIZE);
2105 4969741 : BUG_ON(from > to);
2106 :
2107 4969741 : head = folio_create_buffers(folio, inode, 0);
2108 4969741 : blocksize = head->b_size;
2109 4969741 : bbits = block_size_bits(blocksize);
2110 :
2111 4969741 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2112 :
/*
 * Circular-list walk: "!block_start" lets the first iteration run even
 * though bh == head at that point.
 */
2113 10124287 : for(bh = head, block_start = 0; bh != head || !block_start;
2114 5154546 : block++, block_start=block_end, bh = bh->b_this_page) {
2115 5154563 : block_end = block_start + blocksize;
/* Buffer entirely outside the write range: only inherit uptodate from the folio. */
2116 5154563 : if (block_end <= from || block_start >= to) {
2117 139535 : if (folio_test_uptodate(folio)) {
2118 70980 : if (!buffer_uptodate(bh))
2119 0 : set_buffer_uptodate(bh);
2120 : }
2121 139535 : continue;
2122 : }
2123 10030056 : if (buffer_new(bh))
2124 0 : clear_buffer_new(bh);
2125 10030056 : if (!buffer_mapped(bh)) {
2126 220431 : WARN_ON(bh->b_size != blocksize);
2127 220431 : if (get_block) {
2128 220431 : err = get_block(inode, block, bh, 1);
/* On failure leave the loop; reads already queued are still waited for below. */
2129 220431 : if (err)
2130 : break;
2131 : } else {
2132 0 : iomap_to_bh(inode, block, bh, iomap);
2133 : }
2134 :
2135 440828 : if (buffer_new(bh)) {
2136 32933 : clean_bdev_bh_alias(bh);
/* Folio already holds valid data: the new buffer can be dirtied right away. */
2137 32933 : if (folio_test_uptodate(folio)) {
2138 631 : clear_buffer_new(bh);
2139 631 : set_buffer_uptodate(bh);
2140 631 : mark_buffer_dirty(bh);
2141 631 : continue;
2142 : }
/* Otherwise zero the parts of the new buffer that the write does not cover. */
2143 32302 : if (block_end > to || block_start < from)
2144 898 : folio_zero_segments(folio,
2145 : to, block_end,
2146 : block_start, from);
2147 32302 : continue;
2148 : }
2149 : }
2150 4982078 : if (folio_test_uptodate(folio)) {
2151 9595582 : if (!buffer_uptodate(bh))
2152 0 : set_buffer_uptodate(bh);
2153 4797791 : continue;
2154 : }
/* Partially overwritten buffer with real on-disk contents: read it in first. */
2155 736442 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2156 183934 : !buffer_unwritten(bh) &&
2157 183934 : (block_start < from || block_end > to)) {
2158 102000 : bh_read_nowait(bh, 0);
2159 102000 : *wait_bh++=bh;
2160 : }
2161 : }
2162 : /*
2163 : * If we issued read requests - let them complete.
2164 : */
2165 5071741 : while(wait_bh > wait) {
2166 102000 : wait_on_buffer(*--wait_bh);
2167 204000 : if (!buffer_uptodate(*wait_bh))
2168 0 : err = -EIO;
2169 : }
2170 4969741 : if (unlikely(err))
2171 17 : folio_zero_new_buffers(folio, from, to);
2172 4969741 : return err;
2173 : }
2174 :
2175 4969741 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2176 : get_block_t *get_block)
2177 : {
2178 9939482 : return __block_write_begin_int(page_folio(page), pos, len, get_block,
2179 : NULL);
2180 : }
2181 : EXPORT_SYMBOL(__block_write_begin);
2182 :
2183 4969724 : static int __block_commit_write(struct inode *inode, struct folio *folio,
2184 : size_t from, size_t to)
2185 : {
2186 4969724 : size_t block_start, block_end;
2187 4969724 : bool partial = false;
2188 4969724 : unsigned blocksize;
2189 4969724 : struct buffer_head *bh, *head;
2190 :
2191 4969724 : bh = head = folio_buffers(folio);
2192 4969724 : blocksize = bh->b_size;
2193 :
2194 4969724 : block_start = 0;
2195 5154122 : do {
2196 5154122 : block_end = block_start + blocksize;
2197 5154122 : if (block_end <= from || block_start >= to) {
2198 278272 : if (!buffer_uptodate(bh))
2199 68876 : partial = true;
2200 : } else {
2201 5014986 : set_buffer_uptodate(bh);
2202 5014986 : mark_buffer_dirty(bh);
2203 : }
2204 10308244 : if (buffer_new(bh))
2205 32302 : clear_buffer_new(bh);
2206 :
2207 5154122 : block_start = block_end;
2208 5154122 : bh = bh->b_this_page;
2209 5154122 : } while (bh != head);
2210 :
2211 : /*
2212 : * If this is a partial write which happened to make all buffers
2213 : * uptodate then we can optimize away a bogus read_folio() for
2214 : * the next read(). Here we 'discover' whether the folio went
2215 : * uptodate as a result of this (potentially partial) write.
2216 : */
2217 4969724 : if (!partial)
2218 4963707 : folio_mark_uptodate(folio);
2219 4969724 : return 0;
2220 : }
2221 :
2222 : /*
2223 : * block_write_begin takes care of the basic task of block allocation and
2224 : * bringing partial write blocks uptodate first.
2225 : *
2226 : * The filesystem needs to handle block truncation upon failure.
2227 : */
2228 4969283 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2229 : struct page **pagep, get_block_t *get_block)
2230 : {
2231 4969283 : pgoff_t index = pos >> PAGE_SHIFT;
2232 4969283 : struct page *page;
2233 4969283 : int status;
2234 :
2235 4969283 : page = grab_cache_page_write_begin(mapping, index);
2236 4969283 : if (!page)
2237 : return -ENOMEM;
2238 :
2239 4969283 : status = __block_write_begin(page, pos, len, get_block);
2240 4969283 : if (unlikely(status)) {
2241 17 : unlock_page(page);
2242 17 : put_page(page);
2243 17 : page = NULL;
2244 : }
2245 :
2246 4969283 : *pagep = page;
2247 4969283 : return status;
2248 : }
2249 : EXPORT_SYMBOL(block_write_begin);
2250 :
2251 4969724 : int block_write_end(struct file *file, struct address_space *mapping,
2252 : loff_t pos, unsigned len, unsigned copied,
2253 : struct page *page, void *fsdata)
2254 : {
2255 4969724 : struct folio *folio = page_folio(page);
2256 4969724 : struct inode *inode = mapping->host;
2257 4969724 : size_t start = pos - folio_pos(folio);
2258 :
2259 4969724 : if (unlikely(copied < len)) {
2260 : /*
2261 : * The buffers that were written will now be uptodate, so
2262 : * we don't have to worry about a read_folio reading them
2263 : * and overwriting a partial write. However if we have
2264 : * encountered a short write and only partially written
2265 : * into a buffer, it will not be marked uptodate, so a
2266 : * read_folio might come in and destroy our partial write.
2267 : *
2268 : * Do the simplest thing, and just treat any short write to a
2269 : * non uptodate folio as a zero-length write, and force the
2270 : * caller to redo the whole thing.
2271 : */
2272 0 : if (!folio_test_uptodate(folio))
2273 0 : copied = 0;
2274 :
2275 0 : folio_zero_new_buffers(folio, start+copied, start+len);
2276 : }
2277 4969724 : flush_dcache_folio(folio);
2278 :
2279 : /* This could be a short (even 0-length) commit */
2280 4969724 : __block_commit_write(inode, folio, start, start + copied);
2281 :
2282 4969724 : return copied;
2283 : }
2284 : EXPORT_SYMBOL(block_write_end);
2285 :
2286 2534 : int generic_write_end(struct file *file, struct address_space *mapping,
2287 : loff_t pos, unsigned len, unsigned copied,
2288 : struct page *page, void *fsdata)
2289 : {
2290 2534 : struct inode *inode = mapping->host;
2291 2534 : loff_t old_size = inode->i_size;
2292 2534 : bool i_size_changed = false;
2293 :
2294 2534 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2295 :
2296 : /*
2297 : * No need to use i_size_read() here, the i_size cannot change under us
2298 : * because we hold i_rwsem.
2299 : *
2300 : * But it's important to update i_size while still holding page lock:
2301 : * page writeout could otherwise come in and zero beyond i_size.
2302 : */
2303 2534 : if (pos + copied > inode->i_size) {
2304 357 : i_size_write(inode, pos + copied);
2305 357 : i_size_changed = true;
2306 : }
2307 :
2308 2534 : unlock_page(page);
2309 2534 : put_page(page);
2310 :
2311 2534 : if (old_size < pos)
2312 166 : pagecache_isize_extended(inode, old_size, pos);
2313 : /*
2314 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2315 : * makes the holding time of page lock longer. Second, it forces lock
2316 : * ordering of page lock and transaction start for journaling
2317 : * filesystems.
2318 : */
2319 2534 : if (i_size_changed)
2320 357 : mark_inode_dirty(inode);
2321 2534 : return copied;
2322 : }
2323 : EXPORT_SYMBOL(generic_write_end);
2324 :
2325 : /*
2326 : * block_is_partially_uptodate checks whether buffers within a folio are
2327 : * uptodate or not.
2328 : *
2329 : * Returns true if all buffers which correspond to the specified part
2330 : * of the folio are uptodate.
2331 : */
2332 15 : bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2333 : {
2334 15 : unsigned block_start, block_end, blocksize;
2335 15 : unsigned to;
2336 15 : struct buffer_head *bh, *head;
2337 15 : bool ret = true;
2338 :
2339 15 : head = folio_buffers(folio);
2340 15 : if (!head)
2341 : return false;
2342 15 : blocksize = head->b_size;
2343 15 : to = min_t(unsigned, folio_size(folio) - from, count);
2344 15 : to = from + to;
2345 15 : if (from < blocksize && to > folio_size(folio) - blocksize)
2346 : return false;
2347 :
2348 : bh = head;
2349 : block_start = 0;
2350 204 : do {
2351 204 : block_end = block_start + blocksize;
2352 204 : if (block_end > from && block_start < to) {
2353 118 : if (!buffer_uptodate(bh)) {
2354 : ret = false;
2355 : break;
2356 : }
2357 48 : if (block_end >= to)
2358 : break;
2359 : }
2360 192 : block_start = block_end;
2361 192 : bh = bh->b_this_page;
2362 192 : } while (bh != head);
2363 :
2364 : return ret;
2365 : }
2366 : EXPORT_SYMBOL(block_is_partially_uptodate);
2367 :
2368 : /*
2369 : * Generic "read_folio" function for block devices that have the normal
2370 : * get_block functionality. This is most of the block device filesystems.
2371 : * Reads the folio asynchronously --- the unlock_buffer() and
2372 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2373 : * folio once IO has completed.
 *
 * @folio:     folio to fill; must not be a large folio (VM_BUG_ON_FOLIO)
 * @get_block: mapping callback, called with create == 0
 *
 * Always returns 0. A get_block() failure is recorded with
 * folio_set_error() and keeps the folio from being marked uptodate on the
 * no-I/O path. When no buffer needs I/O the folio is unlocked here.
2374 : */
2375 6301 : int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2376 : {
2377 6301 : struct inode *inode = folio->mapping->host;
2378 6301 : sector_t iblock, lblock;
2379 6301 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2380 6301 : unsigned int blocksize, bbits;
2381 6301 : int nr, i;
2382 6301 : int fully_mapped = 1;
2383 6301 : bool page_error = false;
2384 6301 : loff_t limit = i_size_read(inode);
2385 :
2386 : /* This is needed for ext4. */
2387 6301 : if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2388 : limit = inode->i_sb->s_maxbytes;
2389 :
2390 6301 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2391 :
2392 6301 : head = folio_create_buffers(folio, inode, 0);
2393 6301 : blocksize = head->b_size;
2394 6301 : bbits = block_size_bits(blocksize);
2395 :
/* iblock: first file block of the folio; lblock: first block at or past the limit. */
2396 6301 : iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2397 6301 : lblock = (limit+blocksize-1) >> bbits;
2398 6301 : bh = head;
2399 6301 : nr = 0;
2400 6301 : i = 0;
2401 :
/*
 * Stage one: collect the buffers that need I/O in arr[]. Each "continue"
 * jumps to the loop condition, which advances i, iblock and bh.
 */
2402 90610 : do {
2403 181220 : if (buffer_uptodate(bh))
2404 1302 : continue;
2405 :
2406 178616 : if (!buffer_mapped(bh)) {
2407 88294 : int err = 0;
2408 :
2409 88294 : fully_mapped = 0;
/* Blocks at or beyond lblock are never passed to get_block(). */
2410 88294 : if (iblock < lblock) {
2411 85476 : WARN_ON(bh->b_size != blocksize);
2412 85476 : err = get_block(inode, iblock, bh, 0);
2413 85476 : if (err) {
2414 0 : folio_set_error(folio);
2415 : page_error = true;
2416 : }
2417 : }
/* Still unmapped: zero-fill it; it counts as uptodate only if get_block() did not fail. */
2418 176588 : if (!buffer_mapped(bh)) {
2419 5118 : folio_zero_range(folio, i * blocksize,
2420 : blocksize);
2421 5118 : if (!err)
2422 5118 : set_buffer_uptodate(bh);
2423 5118 : continue;
2424 : }
2425 : /*
2426 : * get_block() might have updated the buffer
2427 : * synchronously
2428 : */
2429 166352 : if (buffer_uptodate(bh))
2430 0 : continue;
2431 : }
2432 84190 : arr[nr++] = bh;
2433 90610 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2434 :
2435 6301 : if (fully_mapped)
2436 57 : folio_set_mappedtodisk(folio);
2437 :
2438 6301 : if (!nr) {
2439 : /*
2440 : * All buffers are uptodate - we can set the folio uptodate
2441 : * as well. But not if get_block() returned an error.
2442 : */
2443 90 : if (!page_error)
2444 90 : folio_mark_uptodate(folio);
2445 90 : folio_unlock(folio);
2446 90 : return 0;
2447 : }
2448 :
2449 : /* Stage two: lock the buffers */
2450 90401 : for (i = 0; i < nr; i++) {
2451 84190 : bh = arr[i];
2452 84190 : lock_buffer(bh);
2453 84190 : mark_buffer_async_read(bh);
2454 : }
2455 :
2456 : /*
2457 : * Stage 3: start the IO. Check for uptodateness
2458 : * inside the buffer lock in case another process reading
2459 : * the underlying blockdev brought it uptodate (the sct fix).
2460 : */
2461 90401 : for (i = 0; i < nr; i++) {
2462 84190 : bh = arr[i];
2463 168380 : if (buffer_uptodate(bh))
2464 2 : end_buffer_async_read(bh, 1);
2465 : else
2466 84188 : submit_bh(REQ_OP_READ, bh);
2467 : }
2468 : return 0;
2469 : }
2470 : EXPORT_SYMBOL(block_read_full_folio);
2471 :
2472 : /* utility function for filesystems that need to do work on expanding
2473 : * truncates. Uses filesystem pagecache writes to allow the filesystem to
2474 : * deal with the hole.
2475 : */
2476 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2477 : {
2478 0 : struct address_space *mapping = inode->i_mapping;
2479 0 : const struct address_space_operations *aops = mapping->a_ops;
2480 0 : struct page *page;
2481 0 : void *fsdata = NULL;
2482 0 : int err;
2483 :
2484 0 : err = inode_newsize_ok(inode, size);
2485 0 : if (err)
2486 0 : goto out;
2487 :
2488 0 : err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
2489 0 : if (err)
2490 0 : goto out;
2491 :
2492 0 : err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
2493 0 : BUG_ON(err > 0);
2494 :
2495 0 : out:
2496 0 : return err;
2497 : }
2498 : EXPORT_SYMBOL(generic_cont_expand_simple);
2499 :
/*
 * cont_expand_zero - zero-fill a file from *bytes up to @pos.
 * @file:    passed through to write_begin()/write_end()
 * @mapping: address space being extended
 * @pos:     position up to which zeroes are written
 * @bytes:   in/out: current position from which zeroing starts; rounded up
 *           to the next block boundary here when it is not block aligned
 *
 * Zeroes are written one page at a time through the mapping's own
 * write_begin()/write_end(). Returns 0, the negative error from either
 * callback, or -EINTR when a fatal signal is pending.
 *
 * NOTE(review): the loop re-reads *bytes every iteration but does not advance
 * it itself on the block-aligned path; presumably the filesystem's
 * write_begin()/write_end() moves it forward -- confirm against callers.
 */
2500 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2501 : loff_t pos, loff_t *bytes)
2502 : {
2503 0 : struct inode *inode = mapping->host;
2504 0 : const struct address_space_operations *aops = mapping->a_ops;
2505 0 : unsigned int blocksize = i_blocksize(inode);
2506 0 : struct page *page;
2507 0 : void *fsdata = NULL;
2508 0 : pgoff_t index, curidx;
2509 0 : loff_t curpos;
2510 0 : unsigned zerofrom, offset, len;
2511 0 : int err = 0;
2512 :
/* Page index and in-page offset of the target position. */
2513 0 : index = pos >> PAGE_SHIFT;
2514 0 : offset = pos & ~PAGE_MASK;
2515 :
/* Whole pages before the one containing pos: zero from curpos to the end of each page. */
2516 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2517 0 : zerofrom = curpos & ~PAGE_MASK;
2518 0 : if (zerofrom & (blocksize-1)) {
2519 0 : *bytes |= (blocksize-1);
2520 0 : (*bytes)++;
2521 : }
2522 0 : len = PAGE_SIZE - zerofrom;
2523 :
2524 0 : err = aops->write_begin(file, mapping, curpos, len,
2525 : &page, &fsdata);
2526 0 : if (err)
2527 0 : goto out;
2528 0 : zero_user(page, zerofrom, len);
2529 0 : err = aops->write_end(file, mapping, curpos, len, len,
2530 : page, fsdata);
2531 0 : if (err < 0)
2532 0 : goto out;
/* write_end() reports bytes written; anything but a full write is a bug here. */
2533 0 : BUG_ON(err != len);
2534 0 : err = 0;
2535 :
2536 0 : balance_dirty_pages_ratelimited(mapping);
2537 :
2538 0 : if (fatal_signal_pending(current)) {
2539 0 : err = -EINTR;
2540 0 : goto out;
2541 : }
2542 : }
2543 :
2544 : /* page covers the boundary, find the boundary offset */
2545 0 : if (index == curidx) {
2546 0 : zerofrom = curpos & ~PAGE_MASK;
2547 : /* if we will expand the thing last block will be filled */
2548 0 : if (offset <= zerofrom) {
2549 0 : goto out;
2550 : }
2551 0 : if (zerofrom & (blocksize-1)) {
2552 0 : *bytes |= (blocksize-1);
2553 0 : (*bytes)++;
2554 : }
/* Zero only up to the target offset within this page. */
2555 0 : len = offset - zerofrom;
2556 :
2557 0 : err = aops->write_begin(file, mapping, curpos, len,
2558 : &page, &fsdata);
2559 0 : if (err)
2560 0 : goto out;
2561 0 : zero_user(page, zerofrom, len);
2562 0 : err = aops->write_end(file, mapping, curpos, len, len,
2563 : page, fsdata);
2564 0 : if (err < 0)
2565 0 : goto out;
2566 0 : BUG_ON(err != len);
2567 : err = 0;
2568 : }
2569 0 : out:
2570 0 : return err;
2571 : }
2572 :
2573 : /*
2574 : * For moronic filesystems that do not allow holes in file.
2575 : * We may have to extend the file.
2576 : */
2577 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2578 : loff_t pos, unsigned len,
2579 : struct page **pagep, void **fsdata,
2580 : get_block_t *get_block, loff_t *bytes)
2581 : {
2582 0 : struct inode *inode = mapping->host;
2583 0 : unsigned int blocksize = i_blocksize(inode);
2584 0 : unsigned int zerofrom;
2585 0 : int err;
2586 :
2587 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2588 0 : if (err)
2589 : return err;
2590 :
2591 0 : zerofrom = *bytes & ~PAGE_MASK;
2592 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2593 0 : *bytes |= (blocksize-1);
2594 0 : (*bytes)++;
2595 : }
2596 :
2597 0 : return block_write_begin(mapping, pos, len, pagep, get_block);
2598 : }
2599 : EXPORT_SYMBOL(cont_write_begin);
2600 :
2601 0 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2602 : {
2603 0 : struct folio *folio = page_folio(page);
2604 0 : struct inode *inode = folio->mapping->host;
2605 0 : __block_commit_write(inode, folio, from, to);
2606 0 : return 0;
2607 : }
2608 : EXPORT_SYMBOL(block_commit_write);
2609 :
2610 : /*
2611 : * block_page_mkwrite() is not allowed to change the file size as it gets
2612 : * called from a page fault handler when a page is first dirtied. Hence we must
2613 : * be careful to check for EOF conditions here. We set the page up correctly
2614 : * for a written page which means we get ENOSPC checking when writing into
2615 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2616 : * support these features.
2617 : *
2618 : * We are not allowed to take the i_mutex here so we have to play games to
2619 : * protect against truncate races as the page could now be beyond EOF. Because
2620 : * truncate writes the inode size before removing pages, once we have the
2621 : * page lock we can determine safely if the page is beyond EOF. If it is not
2622 : * beyond EOF, then the page is guaranteed safe against truncation until we
2623 : * unlock the page.
2624 : *
2625 : * Direct callers of this function should protect against filesystem freezing
2626 : * using sb_start_pagefault() - sb_end_pagefault() functions.
 *
 * Returns 0 on success; in that case the folio has been marked dirty and is
 * returned still locked (there is no folio_unlock() on the success path).
 * On failure the folio is unlocked and a negative error is returned:
 * -EFAULT if the folio no longer belongs to the inode's mapping or lies at
 * or beyond i_size, otherwise the error from __block_write_begin_int().
2627 : */
2628 0 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2629 : get_block_t get_block)
2630 : {
2631 0 : struct folio *folio = page_folio(vmf->page);
2632 0 : struct inode *inode = file_inode(vma->vm_file);
2633 0 : unsigned long end;
2634 0 : loff_t size;
2635 0 : int ret;
2636 :
/* i_size is sampled only after the folio lock is held (see the comment above). */
2637 0 : folio_lock(folio);
2638 0 : size = i_size_read(inode);
2639 0 : if ((folio->mapping != inode->i_mapping) ||
2640 : (folio_pos(folio) >= size)) {
2641 : /* We overload EFAULT to mean page got truncated */
2642 0 : ret = -EFAULT;
2643 0 : goto out_unlock;
2644 : }
2645 :
2646 0 : end = folio_size(folio);
2647 : /* folio is wholly or partially inside EOF */
2648 0 : if (folio_pos(folio) + end > size)
2649 0 : end = size - folio_pos(folio);
2650 :
/* Treat the fault as a write of bytes [0, end) of the folio. */
2651 0 : ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
2652 0 : if (!ret)
2653 0 : ret = __block_commit_write(inode, folio, 0, end);
2654 :
2655 0 : if (unlikely(ret < 0))
2656 0 : goto out_unlock;
2657 0 : folio_mark_dirty(folio);
2658 0 : folio_wait_stable(folio);
2659 0 : return 0;
2660 0 : out_unlock:
2661 0 : folio_unlock(folio);
2662 0 : return ret;
2663 : }
2664 : EXPORT_SYMBOL(block_page_mkwrite);
2665 :
/*
 * block_truncate_page - zero the tail of the block that contains @from.
 * @mapping:   address space of the file
 * @from:      byte offset in the file; bytes from here to the end of the
 *             containing block are zeroed in the page cache
 * @get_block: mapping callback, called with create == 0
 *
 * Nothing is done when @from is block aligned or when the block turns out to
 * be a hole (still unmapped after get_block()). Otherwise the buffer is
 * brought uptodate if necessary, the tail is zeroed and the buffer is marked
 * dirty.
 *
 * Returns 0, the error from filemap_grab_folio(), get_block() or bh_read().
 */
2666 43 : int block_truncate_page(struct address_space *mapping,
2667 : loff_t from, get_block_t *get_block)
2668 : {
2669 43 : pgoff_t index = from >> PAGE_SHIFT;
2670 43 : unsigned blocksize;
2671 43 : sector_t iblock;
2672 43 : size_t offset, length, pos;
2673 43 : struct inode *inode = mapping->host;
2674 43 : struct folio *folio;
2675 43 : struct buffer_head *bh;
2676 43 : int err = 0;
2677 :
2678 43 : blocksize = i_blocksize(inode);
/* Offset of "from" within its block; zero means block aligned. */
2679 43 : length = from & (blocksize - 1);
2680 :
2681 : /* Block boundary? Nothing to do */
2682 43 : if (!length)
2683 : return 0;
2684 :
/* From here on "length" is the number of bytes to zero, up to the block end. */
2685 41 : length = blocksize - length;
2686 41 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2687 :
2688 41 : folio = filemap_grab_folio(mapping, index);
2689 41 : if (IS_ERR(folio))
2690 0 : return PTR_ERR(folio);
2691 :
2692 41 : bh = folio_buffers(folio);
2693 41 : if (!bh) {
2694 39 : folio_create_empty_buffers(folio, blocksize, 0);
2695 39 : bh = folio_buffers(folio);
2696 : }
2697 :
2698 : /* Find the buffer that contains "offset" */
2699 41 : offset = offset_in_folio(folio, from);
2700 41 : pos = blocksize;
2701 1212 : while (offset >= pos) {
2702 1171 : bh = bh->b_this_page;
2703 1171 : iblock++;
2704 1171 : pos += blocksize;
2705 : }
2706 :
2707 82 : if (!buffer_mapped(bh)) {
2708 39 : WARN_ON(bh->b_size != blocksize);
2709 39 : err = get_block(inode, iblock, bh, 0);
2710 39 : if (err)
2711 0 : goto unlock;
2712 : /* unmapped? It's a hole - nothing to do */
2713 78 : if (!buffer_mapped(bh))
2714 39 : goto unlock;
2715 : }
2716 :
2717 : /* Ok, it's mapped. Make sure it's up-to-date */
2718 2 : if (folio_test_uptodate(folio))
2719 2 : set_buffer_uptodate(bh);
2720 :
/* Delayed and unwritten buffers are not read from disk. */
2721 4 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2722 0 : err = bh_read(bh, 0);
2723 : /* Uhhuh. Read error. Complain and punt. */
2724 0 : if (err < 0)
2725 0 : goto unlock;
2726 : }
2727 :
2728 2 : folio_zero_range(folio, offset, length);
2729 2 : mark_buffer_dirty(bh);
2730 :
/* Common exit: drop the lock and the reference taken by filemap_grab_folio(). */
2731 41 : unlock:
2732 41 : folio_unlock(folio);
2733 41 : folio_put(folio);
2734 :
2735 41 : return err;
2736 : }
2737 : EXPORT_SYMBOL(block_truncate_page);
2738 :
2739 : /*
2740 : * The generic ->writepage function for buffer-backed address_spaces
2741 : */
2742 162862 : int block_write_full_page(struct page *page, get_block_t *get_block,
2743 : struct writeback_control *wbc)
2744 : {
2745 162862 : struct folio *folio = page_folio(page);
2746 162862 : struct inode * const inode = folio->mapping->host;
2747 162862 : loff_t i_size = i_size_read(inode);
2748 :
2749 : /* Is the folio fully inside i_size? */
2750 162862 : if (folio_pos(folio) + folio_size(folio) <= i_size)
2751 162816 : return __block_write_full_folio(inode, folio, get_block, wbc,
2752 : end_buffer_async_write);
2753 :
2754 : /* Is the folio fully outside i_size? (truncate in progress) */
2755 45 : if (folio_pos(folio) >= i_size) {
2756 0 : folio_unlock(folio);
2757 0 : return 0; /* don't care */
2758 : }
2759 :
2760 : /*
2761 : * The folio straddles i_size. It must be zeroed out on each and every
2762 : * writepage invocation because it may be mmapped. "A file is mapped
2763 : * in multiples of the page size. For a file that is not a multiple of
2764 : * the page size, the remaining memory is zeroed when mapped, and
2765 : * writes to that region are not written out to the file."
2766 : */
2767 45 : folio_zero_segment(folio, offset_in_folio(folio, i_size),
2768 : folio_size(folio));
2769 45 : return __block_write_full_folio(inode, folio, get_block, wbc,
2770 : end_buffer_async_write);
2771 : }
2772 : EXPORT_SYMBOL(block_write_full_page);
2773 :
/*
 * generic_block_bmap - map a file block to a block number via get_block().
 * @mapping:   address space of the file
 * @block:     file block to look up
 * @get_block: mapping callback, called with create == 0
 *
 * Uses an on-stack buffer_head whose only initialised field is b_size (all
 * others start out zero) and returns its b_blocknr afterwards. The return
 * value of get_block() is ignored, so the result is 0 unless get_block()
 * stored a block number.
 */
2774 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2775 : get_block_t *get_block)
2776 : {
2777 0 : struct inode *inode = mapping->host;
2778 0 : struct buffer_head tmp = {
2779 0 : .b_size = i_blocksize(inode),
2780 : };
2781 :
2782 0 : get_block(inode, block, &tmp, 0);
2783 0 : return tmp.b_blocknr;
2784 : }
2785 : EXPORT_SYMBOL(generic_block_bmap);
2786 :
/*
 * bio completion callback installed by submit_bh_wbc(). bio->bi_private
 * holds the buffer_head the bio was built for. A BIO_QUIET flag on the bio
 * is copied to the buffer as BH_Quiet, the buffer's own b_end_io handler is
 * called with "uptodate" set when bi_status is zero, and the bio reference
 * is dropped.
 */
2787 379677 : static void end_bio_bh_io_sync(struct bio *bio)
2788 : {
2789 379677 : struct buffer_head *bh = bio->bi_private;
2790 :
2791 379677 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
2792 0 : set_bit(BH_Quiet, &bh->b_state);
2793 :
2794 379677 : bh->b_end_io(bh, !bio->bi_status);
2795 379677 : bio_put(bio);
2796 379677 : }
2797 :
/*
 * submit_bh_wbc - build a one-segment bio for a buffer_head and submit it.
 * @opf: request operation and flags; REQ_META / REQ_PRIO are added when the
 *       buffer carries the corresponding meta / prio state
 * @bh:  buffer to do I/O on. It must be locked, mapped, have b_end_io set
 *       and be neither delayed nor unwritten (all enforced by BUG_ON).
 * @wbc: writeback control used for bio initialisation and cgroup
 *       accounting; may be NULL
 *
 * Completion is reported through end_bio_bh_io_sync(), which calls
 * bh->b_end_io.
 */
2798 379677 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2799 : struct writeback_control *wbc)
2800 : {
2801 379677 : const enum req_op op = opf & REQ_OP_MASK;
2802 379677 : struct bio *bio;
2803 :
2804 759354 : BUG_ON(!buffer_locked(bh));
2805 759354 : BUG_ON(!buffer_mapped(bh));
2806 379677 : BUG_ON(!bh->b_end_io);
2807 759354 : BUG_ON(buffer_delay(bh));
2808 759354 : BUG_ON(buffer_unwritten(bh));
2809 :
2810 : /*
2811 : * Only clear out a write error when rewriting
2812 : */
2813 759354 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2814 95871 : clear_buffer_write_io_error(bh);
2815 :
2816 759354 : if (buffer_meta(bh))
2817 0 : opf |= REQ_META;
2818 759354 : if (buffer_prio(bh))
2819 0 : opf |= REQ_PRIO;
2820 :
2821 379677 : bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2822 :
2823 379677 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2824 :
/* b_blocknr is in units of b_size; convert it to 512-byte sectors. */
2825 379677 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2826 :
2827 379677 : __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
2828 :
2829 379677 : bio->bi_end_io = end_bio_bh_io_sync;
2830 379677 : bio->bi_private = bh;
2831 :
2832 : /* Take care of bh's that straddle the end of the device */
2833 379677 : guard_bio_eod(bio);
2834 :
2835 379677 : if (wbc) {
2836 192604 : wbc_init_bio(wbc, bio);
2837 192604 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
2838 : }
2839 :
2840 379677 : submit_bio(bio);
2841 379677 : }
2842 :
2843 2 : void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2844 : {
2845 84211 : submit_bh_wbc(opf, bh, NULL);
2846 84363 : }
2847 : EXPORT_SYMBOL(submit_bh);
2848 :
2849 175 : void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2850 : {
2851 175 : lock_buffer(bh);
2852 350 : if (!test_clear_buffer_dirty(bh)) {
2853 0 : unlock_buffer(bh);
2854 0 : return;
2855 : }
2856 175 : bh->b_end_io = end_buffer_write_sync;
2857 175 : get_bh(bh);
2858 175 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2859 : }
2860 : EXPORT_SYMBOL(write_dirty_buffer);
2861 :
2862 : /*
2863 : * For a data-integrity writeout, we need to wait upon any in-progress I/O
2864 : * and then start new I/O and then wait upon it. The caller must have a ref on
2865 : * the buffer_head.
2866 : */
2867 673 : int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2868 : {
2869 673 : WARN_ON(atomic_read(&bh->b_count) < 1);
2870 673 : lock_buffer(bh);
2871 1346 : if (test_clear_buffer_dirty(bh)) {
2872 : /*
2873 : * The bh should be mapped, but it might not be if the
2874 : * device was hot-removed. Not much we can do but fail the I/O.
2875 : */
2876 1346 : if (!buffer_mapped(bh)) {
2877 0 : unlock_buffer(bh);
2878 0 : return -EIO;
2879 : }
2880 :
2881 673 : get_bh(bh);
2882 673 : bh->b_end_io = end_buffer_write_sync;
2883 673 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2884 673 : wait_on_buffer(bh);
2885 1346 : if (!buffer_uptodate(bh))
2886 0 : return -EIO;
2887 : } else {
2888 0 : unlock_buffer(bh);
2889 : }
2890 : return 0;
2891 : }
2892 : EXPORT_SYMBOL(__sync_dirty_buffer);
2893 :
2894 673 : int sync_dirty_buffer(struct buffer_head *bh)
2895 : {
2896 673 : return __sync_dirty_buffer(bh, REQ_SYNC);
2897 : }
2898 : EXPORT_SYMBOL(sync_dirty_buffer);
2899 :
2900 : /*
2901 : * try_to_free_buffers() checks if all the buffers on this particular folio
2902 : * are unused, and releases them if so.
2903 : *
2904 : * Exclusion against try_to_free_buffers may be obtained by either
2905 : * locking the folio or by holding its mapping's private_lock.
2906 : *
2907 : * If the folio is dirty but all the buffers are clean then we need to
2908 : * be sure to mark the folio clean as well. This is because the folio
2909 : * may be against a block device, and a later reattachment of buffers
2910 : * to a dirty folio will set *all* buffers dirty. Which would corrupt
2911 : * filesystem data on the same device.
2912 : *
2913 : * The same applies to regular filesystem folios: if all the buffers are
2914 : * clean then we set the folio clean and proceed. To do that, we require
2915 : * total exclusion from block_dirty_folio(). That is obtained with
2916 : * private_lock.
2917 : *
2918 : * try_to_free_buffers() is non-blocking.
2919 : */
2920 : static inline int buffer_busy(struct buffer_head *bh)
2921 : {
2922 355688 : return atomic_read(&bh->b_count) |
2923 355688 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2924 : }
2925 :
2926 : static bool
2927 177794 : drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2928 : {
2929 177794 : struct buffer_head *head = folio_buffers(folio);
2930 177794 : struct buffer_head *bh;
2931 :
2932 177794 : bh = head;
2933 355688 : do {
2934 355688 : if (buffer_busy(bh))
2935 0 : goto failed;
2936 355688 : bh = bh->b_this_page;
2937 355688 : } while (bh != head);
2938 :
2939 355688 : do {
2940 355688 : struct buffer_head *next = bh->b_this_page;
2941 :
2942 355688 : if (bh->b_assoc_map)
2943 0 : __remove_assoc_queue(bh);
2944 355688 : bh = next;
2945 355688 : } while (bh != head);
2946 177794 : *buffers_to_free = head;
2947 177794 : folio_detach_private(folio);
2948 177794 : return true;
2949 : failed:
2950 0 : return false;
2951 : }
2952 :
2953 177794 : bool try_to_free_buffers(struct folio *folio)
2954 : {
2955 177794 : struct address_space * const mapping = folio->mapping;
2956 177794 : struct buffer_head *buffers_to_free = NULL;
2957 177794 : bool ret = 0;
2958 :
2959 177794 : BUG_ON(!folio_test_locked(folio));
2960 177794 : if (folio_test_writeback(folio))
2961 : return false;
2962 :
2963 177794 : if (mapping == NULL) { /* can this still happen? */
2964 0 : ret = drop_buffers(folio, &buffers_to_free);
2965 0 : goto out;
2966 : }
2967 :
2968 177794 : spin_lock(&mapping->private_lock);
2969 177794 : ret = drop_buffers(folio, &buffers_to_free);
2970 :
2971 : /*
2972 : * If the filesystem writes its buffers by hand (eg ext3)
2973 : * then we can have clean buffers against a dirty folio. We
2974 : * clean the folio here; otherwise the VM will never notice
2975 : * that the filesystem did any IO at all.
2976 : *
2977 : * Also, during truncate, discard_buffer will have marked all
2978 : * the folio's buffers clean. We discover that here and clean
2979 : * the folio also.
2980 : *
2981 : * private_lock must be held over this entire operation in order
2982 : * to synchronise against block_dirty_folio and prevent the
2983 : * dirty bit from being lost.
2984 : */
2985 177794 : if (ret)
2986 177794 : folio_cancel_dirty(folio);
2987 177794 : spin_unlock(&mapping->private_lock);
2988 177794 : out:
2989 177794 : if (buffers_to_free) {
2990 : struct buffer_head *bh = buffers_to_free;
2991 :
2992 355681 : do {
2993 355681 : struct buffer_head *next = bh->b_this_page;
2994 355681 : free_buffer_head(bh);
2995 355681 : bh = next;
2996 355681 : } while (bh != buffers_to_free);
2997 : }
2998 : return ret;
2999 : }
3000 : EXPORT_SYMBOL(try_to_free_buffers);
3001 :
3002 : /*
3003 : * Buffer-head allocation
3004 : */
3005 : static struct kmem_cache *bh_cachep __read_mostly;
3006 :
3007 : /*
3008 : * Once the number of bh's in the machine exceeds this level, we start
3009 : * stripping them in writeback.
3010 : */
3011 : static unsigned long max_buffer_heads;
3012 :
3013 : int buffer_heads_over_limit;
3014 :
3015 : struct bh_accounting {
3016 : int nr; /* Number of live bh's */
3017 : int ratelimit; /* Limit cacheline bouncing */
3018 : };
3019 :
3020 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3021 :
3022 711367 : static void recalc_bh_state(void)
3023 : {
3024 711367 : int i;
3025 711367 : int tot = 0;
3026 :
3027 711367 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3028 : return;
3029 170 : __this_cpu_write(bh_accounting.ratelimit, 0);
3030 510 : for_each_online_cpu(i)
3031 340 : tot += per_cpu(bh_accounting, i).nr;
3032 170 : buffer_heads_over_limit = (tot > max_buffer_heads);
3033 : }
3034 :
3035 355688 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3036 : {
3037 355688 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3038 355688 : if (ret) {
3039 355688 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
3040 355688 : spin_lock_init(&ret->b_uptodate_lock);
3041 355688 : preempt_disable();
3042 355688 : __this_cpu_inc(bh_accounting.nr);
3043 355688 : recalc_bh_state();
3044 355688 : preempt_enable();
3045 : }
3046 355688 : return ret;
3047 : }
3048 : EXPORT_SYMBOL(alloc_buffer_head);
3049 :
3050 355685 : void free_buffer_head(struct buffer_head *bh)
3051 : {
3052 355685 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
3053 355685 : kmem_cache_free(bh_cachep, bh);
3054 355679 : preempt_disable();
3055 355677 : __this_cpu_dec(bh_accounting.nr);
3056 355677 : recalc_bh_state();
3057 355682 : preempt_enable();
3058 355682 : }
3059 : EXPORT_SYMBOL(free_buffer_head);
3060 :
3061 15 : static int buffer_exit_cpu_dead(unsigned int cpu)
3062 : {
3063 15 : int i;
3064 15 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3065 :
3066 255 : for (i = 0; i < BH_LRU_SIZE; i++) {
3067 240 : brelse(b->bhs[i]);
3068 240 : b->bhs[i] = NULL;
3069 : }
3070 15 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3071 15 : per_cpu(bh_accounting, cpu).nr = 0;
3072 15 : return 0;
3073 : }
3074 :
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate, locking it if not
 * @bh: struct buffer_head
 *
 * The uptodate bit is checked once without the lock and, if clear, again
 * after the buffer has been locked.
 *
 * Return: 1 if the buffer is up to date (and left unlocked), or 0 if it is
 * not, in which case the buffer is returned locked.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh))
		return 0;		/* still stale: keep the lock held */

	/* It became up to date before we got the lock. */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3093 :
3094 : /**
3095 : * __bh_read - Submit read for a locked buffer
3096 : * @bh: struct buffer_head
3097 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3098 : * @wait: wait until reading finish
3099 : *
3100 : * Returns zero on success or don't wait, and -EIO on error.
3101 : */
3102 102014 : int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3103 : {
3104 102014 : int ret = 0;
3105 :
3106 204028 : BUG_ON(!buffer_locked(bh));
3107 :
3108 102014 : get_bh(bh);
3109 102014 : bh->b_end_io = end_buffer_read_sync;
3110 102014 : submit_bh(REQ_OP_READ | op_flags, bh);
3111 102014 : if (wait) {
3112 7 : wait_on_buffer(bh);
3113 14 : if (!buffer_uptodate(bh))
3114 0 : ret = -EIO;
3115 : }
3116 102014 : return ret;
3117 : }
3118 : EXPORT_SYMBOL(__bh_read);
3119 :
3120 : /**
3121 : * __bh_read_batch - Submit read for a batch of unlocked buffers
3122 : * @nr: entry number of the buffer batch
3123 : * @bhs: a batch of struct buffer_head
3124 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3125 : * @force_lock: force to get a lock on the buffer if set, otherwise drops any
3126 : * buffer that cannot lock.
3127 : *
3128 : * Returns zero on success or don't wait, and -EIO on error.
3129 : */
3130 0 : void __bh_read_batch(int nr, struct buffer_head *bhs[],
3131 : blk_opf_t op_flags, bool force_lock)
3132 : {
3133 0 : int i;
3134 :
3135 0 : for (i = 0; i < nr; i++) {
3136 0 : struct buffer_head *bh = bhs[i];
3137 :
3138 0 : if (buffer_uptodate(bh))
3139 0 : continue;
3140 :
3141 0 : if (force_lock)
3142 0 : lock_buffer(bh);
3143 : else
3144 0 : if (!trylock_buffer(bh))
3145 0 : continue;
3146 :
3147 0 : if (buffer_uptodate(bh)) {
3148 0 : unlock_buffer(bh);
3149 0 : continue;
3150 : }
3151 :
3152 0 : bh->b_end_io = end_buffer_read_sync;
3153 0 : get_bh(bh);
3154 0 : submit_bh(REQ_OP_READ | op_flags, bh);
3155 : }
3156 0 : }
3157 : EXPORT_SYMBOL(__bh_read_batch);
3158 :
3159 0 : void __init buffer_init(void)
3160 : {
3161 0 : unsigned long nrpages;
3162 0 : int ret;
3163 :
3164 0 : bh_cachep = kmem_cache_create("buffer_head",
3165 : sizeof(struct buffer_head), 0,
3166 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3167 : SLAB_MEM_SPREAD),
3168 : NULL);
3169 :
3170 : /*
3171 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3172 : */
3173 0 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3174 0 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3175 0 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3176 : NULL, buffer_exit_cpu_dead);
3177 0 : WARN_ON(ret < 0);
3178 0 : }
|