Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_bit.h"
13 : #include "xfs_mount.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_trans_priv.h"
16 : #include "xfs_buf_item.h"
17 : #include "xfs_inode.h"
18 : #include "xfs_inode_item.h"
19 : #include "xfs_quota.h"
20 : #include "xfs_dquot_item.h"
21 : #include "xfs_dquot.h"
22 : #include "xfs_trace.h"
23 : #include "xfs_log.h"
24 : #include "xfs_log_priv.h"
25 :
26 :
27 : struct kmem_cache *xfs_buf_item_cache;
28 :
29 : static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
30 : {
31 : return container_of(lip, struct xfs_buf_log_item, bli_item);
32 : }
33 :
34 : /* Is this log iovec plausibly large enough to contain the buffer log format? */
35 : bool
36 16817442 : xfs_buf_log_check_iovec(
37 : struct xfs_log_iovec *iovec)
38 : {
39 16817442 : struct xfs_buf_log_format *blfp = iovec->i_addr;
40 16817442 : char *bmp_end;
41 16817442 : char *item_end;
42 :
43 16817442 : if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
44 : return false;
45 :
46 16817442 : item_end = (char *)iovec->i_addr + iovec->i_len;
47 16817442 : bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
48 16817442 : return bmp_end <= item_end;
49 : }
50 :
51 : static inline int
52 : xfs_buf_log_format_size(
53 : struct xfs_buf_log_format *blfp)
54 : {
55 6720416441 : return offsetof(struct xfs_buf_log_format, blf_data_map) +
56 6720416441 : (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
57 : }
58 :
59 : static inline bool
60 6316328300 : xfs_buf_item_straddle(
61 : struct xfs_buf *bp,
62 : uint offset,
63 : int first_bit,
64 : int nbits)
65 : {
66 6316328300 : void *first, *last;
67 :
68 6316328300 : first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
69 6316251640 : last = xfs_buf_offset(bp,
70 6316251640 : offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
71 :
72 6316327265 : if (last - first != nbits * XFS_BLF_CHUNK)
73 0 : return true;
74 : return false;
75 : }
76 :
77 : /*
78 : * Return the number of log iovecs and space needed to log the given buf log
79 : * item segment.
80 : *
81 : * It calculates this as 1 iovec for the buf log format structure and 1 for each
82 : * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
83 : * in a single iovec.
84 : */
85 : STATIC void
86 3328900600 : xfs_buf_item_size_segment(
87 : struct xfs_buf_log_item *bip,
88 : struct xfs_buf_log_format *blfp,
89 : uint offset,
90 : int *nvecs,
91 : int *nbytes)
92 : {
93 3328900600 : struct xfs_buf *bp = bip->bli_buf;
94 3328900600 : int first_bit;
95 3328900600 : int nbits;
96 3328900600 : int next_bit;
97 3328900600 : int last_bit;
98 :
99 3328900600 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
100 3328458189 : if (first_bit == -1)
101 : return;
102 :
103 3328363272 : (*nvecs)++;
104 3328363272 : *nbytes += xfs_buf_log_format_size(blfp);
105 :
106 5417109644 : do {
107 5417109644 : nbits = xfs_contig_bits(blfp->blf_data_map,
108 : blfp->blf_map_size, first_bit);
109 5418875445 : ASSERT(nbits > 0);
110 :
111 : /*
112 : * Straddling a page is rare because we don't log contiguous
113 : * chunks of unmapped buffers anywhere.
114 : */
115 8577319994 : if (nbits > 1 &&
116 3158710829 : xfs_buf_item_straddle(bp, offset, first_bit, nbits))
117 0 : goto slow_scan;
118 :
119 5418609165 : (*nvecs)++;
120 5418609165 : *nbytes += nbits * XFS_BLF_CHUNK;
121 :
122 : /*
123 : * This takes the bit number to start looking from and
124 : * returns the next set bit from there. It returns -1
125 : * if there are no more bits set or the start bit is
126 : * beyond the end of the bitmap.
127 : */
128 5418609165 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
129 5418609165 : (uint)first_bit + nbits + 1);
130 5416972532 : } while (first_bit != -1);
131 :
132 : return;
133 :
134 : slow_scan:
135 : /* Count the first bit we jumped out of the above loop from */
136 0 : (*nvecs)++;
137 0 : *nbytes += XFS_BLF_CHUNK;
138 0 : last_bit = first_bit;
139 0 : while (last_bit != -1) {
140 : /*
141 : * This takes the bit number to start looking from and
142 : * returns the next set bit from there. It returns -1
143 : * if there are no more bits set or the start bit is
144 : * beyond the end of the bitmap.
145 : */
146 0 : next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
147 0 : last_bit + 1);
148 : /*
149 : * If we run out of bits, leave the loop,
150 : * else if we find a new set of bits bump the number of vecs,
151 : * else keep scanning the current set of bits.
152 : */
153 0 : if (next_bit == -1) {
154 : break;
155 0 : } else if (next_bit != last_bit + 1 ||
156 0 : xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
157 0 : last_bit = next_bit;
158 0 : first_bit = next_bit;
159 0 : (*nvecs)++;
160 0 : nbits = 1;
161 : } else {
162 0 : last_bit++;
163 0 : nbits++;
164 : }
165 0 : *nbytes += XFS_BLF_CHUNK;
166 : }
167 : }
168 :
169 : /*
170 : * Return the number of log iovecs and space needed to log the given buf log
171 : * item.
172 : *
173 : * Discontiguous buffers need a format structure per region that is being
174 : * logged. This makes the changes in the buffer appear to log recovery as though
175 : * they came from separate buffers, just like would occur if multiple buffers
176 : * were used instead of a single discontiguous buffer. This enables
177 : * discontiguous buffers to be in-memory constructs, completely transparent to
178 : * what ends up on disk.
179 : *
180 : * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
181 : * format structures. If the item has previously been logged and has dirty
182 : * regions, we do not relog them in stale buffers. This has the effect of
183 : * reducing the size of the relogged item by the amount of dirty data tracked
184 : * by the log item. This can result in the committing transaction reducing the
185 : * amount of space being consumed by the CIL.
186 : */
187 : STATIC void
188 3362071931 : xfs_buf_item_size(
189 : struct xfs_log_item *lip,
190 : int *nvecs,
191 : int *nbytes)
192 : {
193 3362071931 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
194 3362071931 : struct xfs_buf *bp = bip->bli_buf;
195 3362071931 : int i;
196 3362071931 : int bytes;
197 3362071931 : uint offset = 0;
198 :
199 3362071931 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
200 3362071931 : if (bip->bli_flags & XFS_BLI_STALE) {
201 : /*
202 : * The buffer is stale, so all we need to log is the buf log
203 : * format structure with the cancel flag in it as we are never
204 : * going to replay the changes tracked in the log item.
205 : */
206 31557747 : trace_xfs_buf_item_size_stale(bip);
207 31554562 : ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
208 31554562 : *nvecs += bip->bli_format_count;
209 63111027 : for (i = 0; i < bip->bli_format_count; i++) {
210 31556465 : *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
211 : }
212 33714103 : return;
213 : }
214 :
215 3330514184 : ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
216 :
217 3330514184 : if (bip->bli_flags & XFS_BLI_ORDERED) {
218 : /*
219 : * The buffer has been logged just to order it. It is not being
220 : * included in the transaction commit, so no vectors are used at
221 : * all.
222 : */
223 2159566 : trace_xfs_buf_item_size_ordered(bip);
224 2159541 : *nvecs = XFS_LOG_VEC_ORDERED;
225 2159541 : return;
226 : }
227 :
228 : /*
229 : * The vector count is based on the number of buffer vectors we have
230 : * dirty bits in. This will only be greater than one when we have a
231 : * compound buffer with more than one segment dirty. Hence for compound
232 : * buffers we need to track which segment the dirty bits correspond to,
233 : * and when we move from one segment to the next increment the vector
234 : * count for the extra buf log format structure that will need to be
235 : * written.
236 : */
237 3328354618 : bytes = 0;
238 6656938402 : for (i = 0; i < bip->bli_format_count; i++) {
239 3328608097 : xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
240 : nvecs, &bytes);
241 3328583784 : offset += BBTOB(bp->b_maps[i].bm_len);
242 : }
243 :
244 : /*
245 : * Round up the buffer size required to minimise the number of memory
246 : * allocations that need to be done as this item grows when relogged by
247 : * repeated modifications.
248 : */
249 3328330305 : *nbytes = round_up(bytes, 512);
250 3328330305 : trace_xfs_buf_item_size(bip);
251 : }
252 :
253 : static inline void
254 5421992220 : xfs_buf_item_copy_iovec(
255 : struct xfs_log_vec *lv,
256 : struct xfs_log_iovec **vecp,
257 : struct xfs_buf *bp,
258 : uint offset,
259 : int first_bit,
260 : uint nbits)
261 : {
262 5421992220 : offset += first_bit * XFS_BLF_CHUNK;
263 5421428843 : xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
264 : xfs_buf_offset(bp, offset),
265 5421992220 : nbits * XFS_BLF_CHUNK);
266 5424140186 : }
267 :
268 : static void
269 3360496704 : xfs_buf_item_format_segment(
270 : struct xfs_buf_log_item *bip,
271 : struct xfs_log_vec *lv,
272 : struct xfs_log_iovec **vecp,
273 : uint offset,
274 : struct xfs_buf_log_format *blfp)
275 : {
276 3360496704 : struct xfs_buf *bp = bip->bli_buf;
277 3360496704 : uint base_size;
278 3360496704 : int first_bit;
279 3360496704 : int last_bit;
280 3360496704 : int next_bit;
281 3360496704 : uint nbits;
282 :
283 : /* copy the flags across from the base format item */
284 3360496704 : blfp->blf_flags = bip->__bli_format.blf_flags;
285 :
286 : /*
287 : * Base size is the actual size of the ondisk structure - it reflects
288 : * the actual size of the dirty bitmap rather than the size of the in
289 : * memory structure.
290 : */
291 3360496704 : base_size = xfs_buf_log_format_size(blfp);
292 :
293 3360496704 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
294 3360343430 : if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
295 : /*
296 : * If the map is not be dirty in the transaction, mark
297 : * the size as zero and do not advance the vector pointer.
298 : */
299 : return;
300 : }
301 :
302 3360248513 : blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
303 3359591000 : blfp->blf_size = 1;
304 :
305 3359591000 : if (bip->bli_flags & XFS_BLI_STALE) {
306 : /*
307 : * The buffer is stale, so all we need to log
308 : * is the buf log format structure with the
309 : * cancel flag in it.
310 : */
311 31552999 : trace_xfs_buf_item_format_stale(bip);
312 31553704 : ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
313 31553704 : return;
314 : }
315 :
316 :
317 : /*
318 : * Fill in an iovec for each set of contiguous chunks.
319 : */
320 5422292846 : do {
321 5422292846 : ASSERT(first_bit >= 0);
322 5422292846 : nbits = xfs_contig_bits(blfp->blf_data_map,
323 : blfp->blf_map_size, first_bit);
324 5423848596 : ASSERT(nbits > 0);
325 :
326 : /*
327 : * Straddling a page is rare because we don't log contiguous
328 : * chunks of unmapped buffers anywhere.
329 : */
330 8582686340 : if (nbits > 1 &&
331 3158809064 : xfs_buf_item_straddle(bp, offset, first_bit, nbits))
332 0 : goto slow_scan;
333 :
334 5423877276 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
335 : first_bit, nbits);
336 5423687520 : blfp->blf_size++;
337 :
338 : /*
339 : * This takes the bit number to start looking from and
340 : * returns the next set bit from there. It returns -1
341 : * if there are no more bits set or the start bit is
342 : * beyond the end of the bitmap.
343 : */
344 5423687520 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
345 5423687520 : (uint)first_bit + nbits + 1);
346 5423077790 : } while (first_bit != -1);
347 :
348 : return;
349 :
350 : slow_scan:
351 0 : ASSERT(bp->b_addr == NULL);
352 : last_bit = first_bit;
353 : nbits = 1;
354 0 : for (;;) {
355 : /*
356 : * This takes the bit number to start looking from and
357 : * returns the next set bit from there. It returns -1
358 : * if there are no more bits set or the start bit is
359 : * beyond the end of the bitmap.
360 : */
361 0 : next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
362 0 : (uint)last_bit + 1);
363 : /*
364 : * If we run out of bits fill in the last iovec and get out of
365 : * the loop. Else if we start a new set of bits then fill in
366 : * the iovec for the series we were looking at and start
367 : * counting the bits in the new one. Else we're still in the
368 : * same set of bits so just keep counting and scanning.
369 : */
370 0 : if (next_bit == -1) {
371 0 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
372 : first_bit, nbits);
373 0 : blfp->blf_size++;
374 0 : break;
375 0 : } else if (next_bit != last_bit + 1 ||
376 0 : xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
377 0 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
378 : first_bit, nbits);
379 0 : blfp->blf_size++;
380 0 : first_bit = next_bit;
381 0 : last_bit = next_bit;
382 0 : nbits = 1;
383 : } else {
384 0 : last_bit++;
385 0 : nbits++;
386 : }
387 : }
388 : }
389 :
390 : /*
391 : * This is called to fill in the vector of log iovecs for the
392 : * given log buf item. It fills the first entry with a buf log
393 : * format structure, and the rest point to contiguous chunks
394 : * within the buffer.
395 : */
396 : STATIC void
397 3360296823 : xfs_buf_item_format(
398 : struct xfs_log_item *lip,
399 : struct xfs_log_vec *lv)
400 : {
401 3360296823 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
402 3360296823 : struct xfs_buf *bp = bip->bli_buf;
403 3360296823 : struct xfs_log_iovec *vecp = NULL;
404 3360296823 : uint offset = 0;
405 3360296823 : int i;
406 :
407 3360296823 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
408 3360296823 : ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
409 : (bip->bli_flags & XFS_BLI_STALE));
410 3360296823 : ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
411 : (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
412 : && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
413 3360296823 : ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
414 : (bip->bli_flags & XFS_BLI_STALE));
415 :
416 :
417 : /*
418 : * If it is an inode buffer, transfer the in-memory state to the
419 : * format flags and clear the in-memory state.
420 : *
421 : * For buffer based inode allocation, we do not transfer
422 : * this state if the inode buffer allocation has not yet been committed
423 : * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
424 : * correct replay of the inode allocation.
425 : *
426 : * For icreate item based inode allocation, the buffers aren't written
427 : * to the journal during allocation, and hence we should always tag the
428 : * buffer as an inode buffer so that the correct unlinked list replay
429 : * occurs during recovery.
430 : */
431 3360296823 : if (bip->bli_flags & XFS_BLI_INODE_BUF) {
432 73423053 : if (xfs_has_v3inodes(lip->li_log->l_mp) ||
433 0 : !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
434 0 : xfs_log_item_in_current_chkpt(lip)))
435 73423053 : bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
436 73423053 : bip->bli_flags &= ~XFS_BLI_INODE_BUF;
437 : }
438 :
439 6720720603 : for (i = 0; i < bip->bli_format_count; i++) {
440 3360529531 : xfs_buf_item_format_segment(bip, lv, &vecp, offset,
441 3360529531 : &bip->bli_formats[i]);
442 3360423780 : offset += BBTOB(bp->b_maps[i].bm_len);
443 : }
444 :
445 : /*
446 : * Check to make sure everything is consistent.
447 : */
448 3360191072 : trace_xfs_buf_item_format(bip);
449 3359884860 : }
450 :
451 : /*
452 : * This is called to pin the buffer associated with the buf log item in memory
453 : * so it cannot be written out.
454 : *
455 : * We take a reference to the buffer log item here so that the BLI life cycle
456 : * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
457 : * inserted into the AIL.
458 : *
459 : * We also need to take a reference to the buffer itself as the BLI unpin
460 : * processing requires accessing the buffer after the BLI has dropped the final
461 : * BLI reference. See xfs_buf_item_unpin() for an explanation.
462 : * If unpins race to drop the final BLI reference and only the
463 : * BLI owns a reference to the buffer, then the loser of the race can have the
464 : * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
465 : * pin count ensures the life cycle of the buffer extends for as
466 : * long as we hold the buffer pin reference in xfs_buf_item_unpin().
467 : */
468 : STATIC void
469 337828456 : xfs_buf_item_pin(
470 : struct xfs_log_item *lip)
471 : {
472 337828456 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
473 :
474 337828456 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
475 337828456 : ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
476 : (bip->bli_flags & XFS_BLI_ORDERED) ||
477 : (bip->bli_flags & XFS_BLI_STALE));
478 :
479 337828456 : trace_xfs_buf_item_pin(bip);
480 :
481 337813231 : xfs_buf_hold(bip->bli_buf);
482 337827524 : atomic_inc(&bip->bli_refcount);
483 337851524 : atomic_inc(&bip->bli_buf->b_pin_count);
484 337836667 : }
485 :
486 : /*
487 : * This is called to unpin the buffer associated with the buf log item which was
488 : * previously pinned with a call to xfs_buf_item_pin(). We enter this function
489 : * with a buffer pin count, a buffer reference and a BLI reference.
490 : *
491 : * We must drop the BLI reference before we unpin the buffer because the AIL
492 : * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
493 : * refcount drops to zero, the bli could still be AIL resident and the buffer
494 : * submitted for I/O at any point before we return. This can result in IO
495 : * completion freeing the buffer while we are still trying to access it here.
496 : * This race condition can also occur in shutdown situations where we abort and
497 : * unpin buffers from contexts other than journal IO completion.
498 : *
499 : * Hence we have to hold a buffer reference per pin count to ensure that the
500 : * buffer cannot be freed until we have finished processing the unpin operation.
501 : * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
502 : * are done processing the buffer state. In the case of an abort (remove =
503 : * true) then we re-use the current pin reference as the IO reference we hand
504 : * off to IO failure handling.
505 : */
506 : STATIC void
507 337870129 : xfs_buf_item_unpin(
508 : struct xfs_log_item *lip,
509 : int remove)
510 : {
511 337870129 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
512 337870129 : struct xfs_buf *bp = bip->bli_buf;
513 337870129 : int stale = bip->bli_flags & XFS_BLI_STALE;
514 337870129 : int freed;
515 :
516 337870129 : ASSERT(bp->b_log_item == bip);
517 337870129 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
518 :
519 337870129 : trace_xfs_buf_item_unpin(bip);
520 :
521 337870127 : freed = atomic_dec_and_test(&bip->bli_refcount);
522 337870130 : if (atomic_dec_and_test(&bp->b_pin_count))
523 303778599 : wake_up_all(&bp->b_waiters);
524 :
525 : /*
526 : * Nothing to do but drop the buffer pin reference if the BLI is
527 : * still active.
528 : */
529 337870129 : if (!freed) {
530 35519726 : xfs_buf_rele(bp);
531 35519726 : return;
532 : }
533 :
534 302350403 : if (stale) {
535 31570973 : ASSERT(bip->bli_flags & XFS_BLI_STALE);
536 31570973 : ASSERT(xfs_buf_islocked(bp));
537 31570973 : ASSERT(bp->b_flags & XBF_STALE);
538 31570973 : ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
539 31570973 : ASSERT(list_empty(&lip->li_trans));
540 31570973 : ASSERT(!bp->b_transp);
541 :
542 31570973 : trace_xfs_buf_item_unpin_stale(bip);
543 :
544 : /*
545 : * The buffer has been locked and referenced since it was marked
546 : * stale so we own both lock and reference exclusively here. We
547 : * do not need the pin reference any more, so drop it now so
548 : * that we only have one reference to drop once item completion
549 : * processing is complete.
550 : */
551 31570973 : xfs_buf_rele(bp);
552 :
553 : /*
554 : * If we get called here because of an IO error, we may or may
555 : * not have the item on the AIL. xfs_trans_ail_delete() will
556 : * take care of that situation. xfs_trans_ail_delete() drops
557 : * the AIL lock.
558 : */
559 31570973 : if (bip->bli_flags & XFS_BLI_STALE_INODE) {
560 713145 : xfs_buf_item_done(bp);
561 713145 : xfs_buf_inode_iodone(bp);
562 713145 : ASSERT(list_empty(&bp->b_li_list));
563 : } else {
564 30857828 : xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
565 30857828 : xfs_buf_item_relse(bp);
566 30857828 : ASSERT(bp->b_log_item == NULL);
567 : }
568 31570973 : xfs_buf_relse(bp);
569 31570973 : return;
570 : }
571 :
572 270779430 : if (remove) {
573 : /*
574 : * We need to simulate an async IO failures here to ensure that
575 : * the correct error completion is run on this buffer. This
576 : * requires a reference to the buffer and for the buffer to be
577 : * locked. We can safely pass ownership of the pin reference to
578 : * the IO to ensure that nothing can free the buffer while we
579 : * wait for the lock and then run the IO failure completion.
580 : */
581 804135 : xfs_buf_lock(bp);
582 804136 : bp->b_flags |= XBF_ASYNC;
583 804136 : xfs_buf_ioend_fail(bp);
584 804136 : return;
585 : }
586 :
587 : /*
588 : * BLI has no more active references - it will be moved to the AIL to
589 : * manage the remaining BLI/buffer life cycle. There is nothing left for
590 : * us to do here so drop the pin reference to the buffer.
591 : */
592 269975295 : xfs_buf_rele(bp);
593 : }
594 :
595 : STATIC uint
596 60459571 : xfs_buf_item_push(
597 : struct xfs_log_item *lip,
598 : struct list_head *buffer_list)
599 : {
600 60459571 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
601 60459571 : struct xfs_buf *bp = bip->bli_buf;
602 60459571 : uint rval = XFS_ITEM_SUCCESS;
603 :
604 60459571 : if (xfs_buf_ispinned(bp))
605 : return XFS_ITEM_PINNED;
606 57291518 : if (!xfs_buf_trylock(bp)) {
607 : /*
608 : * If we have just raced with a buffer being pinned and it has
609 : * been marked stale, we could end up stalling until someone else
610 : * issues a log force to unpin the stale buffer. Check for the
611 : * race condition here so xfsaild recognizes the buffer is pinned
612 : * and queues a log force to move it along.
613 : */
614 1101030 : if (xfs_buf_ispinned(bp))
615 : return XFS_ITEM_PINNED;
616 1101013 : return XFS_ITEM_LOCKED;
617 : }
618 :
619 56190486 : ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
620 :
621 56190486 : trace_xfs_buf_item_push(bip);
622 :
623 : /* has a previous flush failed due to IO errors? */
624 56190486 : if (bp->b_flags & XBF_WRITE_FAIL) {
625 11246 : xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
626 : "Failing async write on buffer block 0x%llx. Retrying async write.",
627 : (long long)xfs_buf_daddr(bp));
628 : }
629 :
630 56190486 : if (!xfs_buf_delwri_queue(bp, buffer_list))
631 1532325 : rval = XFS_ITEM_FLUSHING;
632 56190486 : xfs_buf_unlock(bp);
633 56190486 : return rval;
634 : }
635 :
636 : /*
637 : * Drop the buffer log item refcount and take appropriate action. This helper
638 : * determines whether the bli must be freed or not, since a decrement to zero
639 : * does not necessarily mean the bli is unused.
640 : *
641 : * Return true if the bli is freed, false otherwise.
642 : */
643 : bool
644 22656249476 : xfs_buf_item_put(
645 : struct xfs_buf_log_item *bip)
646 : {
647 22656249476 : struct xfs_log_item *lip = &bip->bli_item;
648 22656249476 : bool aborted;
649 22656249476 : bool dirty;
650 :
651 : /* drop the bli ref and return if it wasn't the last one */
652 22656249476 : if (!atomic_dec_and_test(&bip->bli_refcount))
653 : return false;
654 :
655 : /*
656 : * We dropped the last ref and must free the item if clean or aborted.
657 : * If the bli is dirty and non-aborted, the buffer was clean in the
658 : * transaction but still awaiting writeback from previous changes. In
659 : * that case, the bli is freed on buffer writeback completion.
660 : */
661 25811622767 : aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
662 12905617399 : xlog_is_shutdown(lip->li_log);
663 12906005368 : dirty = bip->bli_flags & XFS_BLI_DIRTY;
664 12906005368 : if (dirty && !aborted)
665 : return false;
666 :
667 : /*
668 : * The bli is aborted or clean. An aborted item may be in the AIL
669 : * regardless of dirty state. For example, consider an aborted
670 : * transaction that invalidated a dirty bli and cleared the dirty
671 : * state.
672 : */
673 8593820960 : if (aborted)
674 15953 : xfs_trans_ail_delete(lip, 0);
675 8593820960 : xfs_buf_item_relse(bip->bli_buf);
676 8593820960 : return true;
677 : }
678 :
679 : /*
680 : * Release the buffer associated with the buf log item. If there is no dirty
681 : * logged data associated with the buffer recorded in the buf log item, then
682 : * free the buf log item and remove the reference to it in the buffer.
683 : *
684 : * This call ignores the recursion count. It is only called when the buffer
685 : * should REALLY be unlocked, regardless of the recursion count.
686 : *
687 : * We unconditionally drop the transaction's reference to the log item. If the
688 : * item was logged, then another reference was taken when it was pinned, so we
689 : * can safely drop the transaction reference now. This also allows us to avoid
690 : * potential races with the unpin code freeing the bli by not referencing the
691 : * bli after we've dropped the reference count.
692 : *
693 : * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
694 : * if necessary but do not unlock the buffer. This is for support of
695 : * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
696 : * free the item.
697 : */
698 : STATIC void
699 4082867351 : xfs_buf_item_release(
700 : struct xfs_log_item *lip)
701 : {
702 4082867351 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
703 4082867351 : struct xfs_buf *bp = bip->bli_buf;
704 4082867351 : bool released;
705 4082867351 : bool hold = bip->bli_flags & XFS_BLI_HOLD;
706 4082867351 : bool stale = bip->bli_flags & XFS_BLI_STALE;
707 : #if defined(DEBUG) || defined(XFS_WARN)
708 4082867351 : bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
709 4082867351 : bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
710 4082867351 : bool aborted = test_bit(XFS_LI_ABORTED,
711 : &lip->li_flags);
712 : #endif
713 :
714 4082867351 : trace_xfs_buf_item_release(bip);
715 :
716 : /*
717 : * The bli dirty state should match whether the blf has logged segments
718 : * except for ordered buffers, where only the bli should be dirty.
719 : */
720 4082692918 : ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
721 : (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
722 4082681349 : ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
723 :
724 : /*
725 : * Clear the buffer's association with this transaction and
726 : * per-transaction state from the bli, which has been copied above.
727 : */
728 4082681349 : bp->b_transp = NULL;
729 4082681349 : bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
730 :
731 : /*
732 : * Unref the item and unlock the buffer unless held or stale. Stale
733 : * buffers remain locked until final unpin unless the bli is freed by
734 : * the unref call. The latter implies shutdown because buffer
735 : * invalidation dirties the bli and transaction.
736 : */
737 4082681349 : released = xfs_buf_item_put(bip);
738 4083719598 : if (hold || (stale && !released))
739 : return;
740 4043149797 : ASSERT(!stale || aborted);
741 4043149797 : xfs_buf_relse(bp);
742 : }
743 :
744 : STATIC void
745 4080644113 : xfs_buf_item_committing(
746 : struct xfs_log_item *lip,
747 : xfs_csn_t seq)
748 : {
749 4080644113 : return xfs_buf_item_release(lip);
750 : }
751 :
752 : /*
753 : * This is called to find out where the oldest active copy of the
754 : * buf log item in the on disk log resides now that the last log
755 : * write of it completed at the given lsn.
756 : * We always re-log all the dirty data in a buffer, so usually the
757 : * latest copy in the on disk log is the only one that matters. For
758 : * those cases we simply return the given lsn.
759 : *
760 : * The one exception to this is for buffers full of newly allocated
761 : * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
762 : * flag set, indicating that only the di_next_unlinked fields from the
763 : * inodes in the buffers will be replayed during recovery. If the
764 : * original newly allocated inode images have not yet been flushed
765 : * when the buffer is so relogged, then we need to make sure that we
766 : * keep the old images in the 'active' portion of the log. We do this
767 : * by returning the original lsn of that transaction here rather than
768 : * the current one.
769 : */
770 : STATIC xfs_lsn_t
771 337870129 : xfs_buf_item_committed(
772 : struct xfs_log_item *lip,
773 : xfs_lsn_t lsn)
774 : {
775 337870129 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
776 :
777 337870129 : trace_xfs_buf_item_committed(bip);
778 :
779 337870129 : if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
780 426755 : return lip->li_lsn;
781 : return lsn;
782 : }
783 :
784 : static const struct xfs_item_ops xfs_buf_item_ops = {
785 : .iop_size = xfs_buf_item_size,
786 : .iop_format = xfs_buf_item_format,
787 : .iop_pin = xfs_buf_item_pin,
788 : .iop_unpin = xfs_buf_item_unpin,
789 : .iop_release = xfs_buf_item_release,
790 : .iop_committing = xfs_buf_item_committing,
791 : .iop_committed = xfs_buf_item_committed,
792 : .iop_push = xfs_buf_item_push,
793 : };
794 :
795 : STATIC void
796 8688738317 : xfs_buf_item_get_format(
797 : struct xfs_buf_log_item *bip,
798 : int count)
799 : {
800 8688738317 : ASSERT(bip->bli_formats == NULL);
801 8688738317 : bip->bli_format_count = count;
802 :
803 8688738317 : if (count == 1) {
804 8688727635 : bip->bli_formats = &bip->__bli_format;
805 8688727635 : return;
806 : }
807 :
808 10682 : bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
809 : 0);
810 : }
811 :
812 : STATIC void
813 8695386251 : xfs_buf_item_free_format(
814 : struct xfs_buf_log_item *bip)
815 : {
816 8695386251 : if (bip->bli_formats != &bip->__bli_format) {
817 10682 : kmem_free(bip->bli_formats);
818 10682 : bip->bli_formats = NULL;
819 : }
820 8695386251 : }
821 :
822 : /*
823 : * Allocate a new buf log item to go with the given buffer.
824 : * Set the buffer's b_log_item field to point to the new
825 : * buf log item.
826 : */
827 : int
828 22665161730 : xfs_buf_item_init(
829 : struct xfs_buf *bp,
830 : struct xfs_mount *mp)
831 : {
832 22665161730 : struct xfs_buf_log_item *bip = bp->b_log_item;
833 22665161730 : int chunks;
834 22665161730 : int map_size;
835 22665161730 : int i;
836 :
837 : /*
838 : * Check to see if there is already a buf log item for
839 : * this buffer. If we do already have one, there is
840 : * nothing to do here so return.
841 : */
842 22665161730 : ASSERT(bp->b_mount == mp);
843 22665161730 : if (bip) {
844 13974536545 : ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
845 13974536545 : ASSERT(!bp->b_transp);
846 13974536545 : ASSERT(bip->bli_buf == bp);
847 13974536545 : return 0;
848 : }
849 :
850 8690625185 : bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
851 8693643807 : xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
852 8681092360 : bip->bli_buf = bp;
853 :
854 : /*
855 : * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
856 : * can be divided into. Make sure not to truncate any pieces.
857 : * map_size is the size of the bitmap needed to describe the
858 : * chunks of the buffer.
859 : *
860 : * Discontiguous buffer support follows the layout of the underlying
861 : * buffer. This makes the implementation as simple as possible.
862 : */
863 8681092360 : xfs_buf_item_get_format(bip, bp->b_map_count);
864 :
865 26053724415 : for (i = 0; i < bip->bli_format_count; i++) {
866 8689806868 : chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
867 : XFS_BLF_CHUNK);
868 8689806868 : map_size = DIV_ROUND_UP(chunks, NBWORD);
869 :
870 8689806868 : if (map_size > XFS_BLF_DATAMAP_SIZE) {
871 0 : kmem_cache_free(xfs_buf_item_cache, bip);
872 0 : xfs_err(mp,
873 : "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
874 : map_size,
875 : BBTOB(bp->b_maps[i].bm_len));
876 0 : return -EFSCORRUPTED;
877 : }
878 :
879 8689806868 : bip->bli_formats[i].blf_type = XFS_LI_BUF;
880 8689806868 : bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
881 8689806868 : bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
882 8689806868 : bip->bli_formats[i].blf_map_size = map_size;
883 : }
884 :
885 8682825187 : bp->b_log_item = bip;
886 8682825187 : xfs_buf_hold(bp);
887 8682825187 : return 0;
888 : }
889 :
890 :
891 : /*
892 : * Mark bytes first through last inclusive as dirty in the buf
893 : * item's bitmap.
894 : */
895 : static void
896 7392386462 : xfs_buf_item_log_segment(
897 : uint first,
898 : uint last,
899 : uint *map)
900 : {
901 7392386462 : uint first_bit;
902 7392386462 : uint last_bit;
903 7392386462 : uint bits_to_set;
904 7392386462 : uint bits_set;
905 7392386462 : uint word_num;
906 7392386462 : uint *wordp;
907 7392386462 : uint bit;
908 7392386462 : uint end_bit;
909 7392386462 : uint mask;
910 :
911 7392386462 : ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
912 7392386462 : ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
913 :
914 : /*
915 : * Convert byte offsets to bit numbers.
916 : */
917 7392386462 : first_bit = first >> XFS_BLF_SHIFT;
918 7392386462 : last_bit = last >> XFS_BLF_SHIFT;
919 :
920 : /*
921 : * Calculate the total number of bits to be set.
922 : */
923 7392386462 : bits_to_set = last_bit - first_bit + 1;
924 :
925 : /*
926 : * Get a pointer to the first word in the bitmap
927 : * to set a bit in.
928 : */
929 7392386462 : word_num = first_bit >> BIT_TO_WORD_SHIFT;
930 7392386462 : wordp = &map[word_num];
931 :
932 : /*
933 : * Calculate the starting bit in the first word.
934 : */
935 7392386462 : bit = first_bit & (uint)(NBWORD - 1);
936 :
937 : /*
938 : * First set any bits in the first word of our range.
939 : * If it starts at bit 0 of the word, it will be
940 : * set below rather than here. That is what the variable
941 : * bit tells us. The variable bits_set tracks the number
942 : * of bits that have been set so far. End_bit is the number
943 : * of the last bit to be set in this word plus one.
944 : */
945 7392386462 : if (bit) {
946 3418493309 : end_bit = min(bit + bits_to_set, (uint)NBWORD);
947 3418493309 : mask = ((1U << (end_bit - bit)) - 1) << bit;
948 3418493309 : *wordp |= mask;
949 3418493309 : wordp++;
950 3418493309 : bits_set = end_bit - bit;
951 : } else {
952 : bits_set = 0;
953 : }
954 :
955 : /*
956 : * Now set bits a whole word at a time that are between
957 : * first_bit and last_bit.
958 : */
959 7530804678 : while ((bits_to_set - bits_set) >= NBWORD) {
960 138418216 : *wordp = 0xffffffff;
961 138418216 : bits_set += NBWORD;
962 138418216 : wordp++;
963 : }
964 :
965 : /*
966 : * Finally, set any bits left to be set in one last partial word.
967 : */
968 7392386462 : end_bit = bits_to_set - bits_set;
969 7392386462 : if (end_bit) {
970 3838325516 : mask = (1U << end_bit) - 1;
971 3838325516 : *wordp |= mask;
972 : }
973 7392386462 : }
974 :
975 : /*
976 : * Mark bytes first through last inclusive as dirty in the buf
977 : * item's bitmap.
978 : */
979 : void
980 7392292305 : xfs_buf_item_log(
981 : struct xfs_buf_log_item *bip,
982 : uint first,
983 : uint last)
984 : {
985 7392292305 : int i;
986 7392292305 : uint start;
987 7392292305 : uint end;
988 7392292305 : struct xfs_buf *bp = bip->bli_buf;
989 :
990 : /*
991 : * walk each buffer segment and mark them dirty appropriately.
992 : */
993 7392292305 : start = 0;
994 14785594058 : for (i = 0; i < bip->bli_format_count; i++) {
995 7393615283 : if (start > last)
996 : break;
997 7393389614 : end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
998 :
999 : /* skip to the map that includes the first byte to log */
1000 7393389614 : if (first > end) {
1001 382183 : start += BBTOB(bp->b_maps[i].bm_len);
1002 382183 : continue;
1003 : }
1004 :
1005 : /*
1006 : * Trim the range to this segment and mark it in the bitmap.
1007 : * Note that we must convert buffer offsets to segment relative
1008 : * offsets (e.g., the first byte of each segment is byte 0 of
1009 : * that segment).
1010 : */
1011 7393007431 : if (first < start)
1012 : first = start;
1013 7393007431 : if (end > last)
1014 : end = last;
1015 7393007431 : xfs_buf_item_log_segment(first - start, end - start,
1016 7393007431 : &bip->bli_formats[i].blf_data_map[0]);
1017 :
1018 7392919570 : start += BBTOB(bp->b_maps[i].bm_len);
1019 : }
1020 7392204444 : }
1021 :
1022 :
1023 : /*
1024 : * Return true if the buffer has any ranges logged/dirtied by a transaction,
1025 : * false otherwise.
1026 : */
1027 : bool
1028 4084844380 : xfs_buf_item_dirty_format(
1029 : struct xfs_buf_log_item *bip)
1030 : {
1031 4084844380 : int i;
1032 :
1033 4157663493 : for (i = 0; i < bip->bli_format_count; i++) {
1034 4084925288 : if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1035 4084842959 : bip->bli_formats[i].blf_map_size))
1036 : return true;
1037 : }
1038 :
1039 : return false;
1040 : }
1041 :
1042 : STATIC void
1043 8695883899 : xfs_buf_item_free(
1044 : struct xfs_buf_log_item *bip)
1045 : {
1046 8695883899 : xfs_buf_item_free_format(bip);
1047 8694884061 : kmem_free(bip->bli_item.li_lv_shadow);
1048 8695313204 : kmem_cache_free(xfs_buf_item_cache, bip);
1049 8696226914 : }
1050 :
1051 : /*
1052 : * xfs_buf_item_relse() is called when the buf log item is no longer needed.
1053 : */
1054 : void
1055 8696717542 : xfs_buf_item_relse(
1056 : struct xfs_buf *bp)
1057 : {
1058 8696717542 : struct xfs_buf_log_item *bip = bp->b_log_item;
1059 :
1060 8696717542 : trace_xfs_buf_item_relse(bp, _RET_IP_);
1061 8695312536 : ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
1062 :
1063 8695312536 : if (atomic_read(&bip->bli_refcount))
1064 : return;
1065 8695114751 : bp->b_log_item = NULL;
1066 8695114751 : xfs_buf_rele(bp);
1067 8696081584 : xfs_buf_item_free(bip);
1068 : }
1069 :
1070 : void
1071 72720695 : xfs_buf_item_done(
1072 : struct xfs_buf *bp)
1073 : {
1074 : /*
1075 : * If we are forcibly shutting down, this may well be off the AIL
1076 : * already. That's because we simulate the log-committed callbacks to
1077 : * unpin these buffers. Or we may never have put this item on AIL
1078 : * because of the transaction was aborted forcibly.
1079 : * xfs_trans_ail_delete() takes care of these.
1080 : *
1081 : * Either way, AIL is useless if we're forcing a shutdown.
1082 : *
1083 : * Note that log recovery writes might have buffer items that are not on
1084 : * the AIL even when the file system is not shut down.
1085 : */
1086 72720695 : xfs_trans_ail_delete(&bp->b_log_item->bli_item,
1087 72720695 : (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
1088 : SHUTDOWN_CORRUPT_INCORE);
1089 72720693 : xfs_buf_item_relse(bp);
1090 72720694 : }
|