Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_bit.h"
13 : #include "xfs_mount.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_trans_priv.h"
16 : #include "xfs_buf_item.h"
17 : #include "xfs_inode.h"
18 : #include "xfs_inode_item.h"
19 : #include "xfs_quota.h"
20 : #include "xfs_dquot_item.h"
21 : #include "xfs_dquot.h"
22 : #include "xfs_trace.h"
23 : #include "xfs_log.h"
24 : #include "xfs_log_priv.h"
25 :
26 :
27 : struct kmem_cache *xfs_buf_item_cache;
28 :
29 : static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
30 : {
31 : return container_of(lip, struct xfs_buf_log_item, bli_item);
32 : }
33 :
34 : /* Is this log iovec plausibly large enough to contain the buffer log format? */
35 : bool
36 13735006 : xfs_buf_log_check_iovec(
37 : struct xfs_log_iovec *iovec)
38 : {
39 13735006 : struct xfs_buf_log_format *blfp = iovec->i_addr;
40 13735006 : char *bmp_end;
41 13735006 : char *item_end;
42 :
43 13735006 : if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
44 : return false;
45 :
46 13735006 : item_end = (char *)iovec->i_addr + iovec->i_len;
47 13735006 : bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
48 13735006 : return bmp_end <= item_end;
49 : }
50 :
51 : static inline int
52 : xfs_buf_log_format_size(
53 : struct xfs_buf_log_format *blfp)
54 : {
55 3558167418 : return offsetof(struct xfs_buf_log_format, blf_data_map) +
56 3558167418 : (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
57 : }
58 :
59 : static inline bool
60 3556389471 : xfs_buf_item_straddle(
61 : struct xfs_buf *bp,
62 : uint offset,
63 : int first_bit,
64 : int nbits)
65 : {
66 3556389471 : void *first, *last;
67 :
68 3556389471 : first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
69 3556604643 : last = xfs_buf_offset(bp,
70 3556604643 : offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
71 :
72 3556624327 : if (last - first != nbits * XFS_BLF_CHUNK)
73 0 : return true;
74 : return false;
75 : }
76 :
77 : /*
78 : * Return the number of log iovecs and space needed to log the given buf log
79 : * item segment.
80 : *
81 : * It calculates this as 1 iovec for the buf log format structure and 1 for each
82 : * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
83 : * in a single iovec.
84 : */
85 : STATIC void
86 1749706225 : xfs_buf_item_size_segment(
87 : struct xfs_buf_log_item *bip,
88 : struct xfs_buf_log_format *blfp,
89 : uint offset,
90 : int *nvecs,
91 : int *nbytes)
92 : {
93 1749706225 : struct xfs_buf *bp = bip->bli_buf;
94 1749706225 : int first_bit;
95 1749706225 : int nbits;
96 1749706225 : int next_bit;
97 1749706225 : int last_bit;
98 :
99 1749706225 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
100 1749714580 : if (first_bit == -1)
101 : return;
102 :
103 1749694028 : (*nvecs)++;
104 1749694028 : *nbytes += xfs_buf_log_format_size(blfp);
105 :
106 2641865199 : do {
107 2641865199 : nbits = xfs_contig_bits(blfp->blf_data_map,
108 : blfp->blf_map_size, first_bit);
109 2642123402 : ASSERT(nbits > 0);
110 :
111 : /*
112 : * Straddling a page is rare because we don't log contiguous
113 : * chunks of unmapped buffers anywhere.
114 : */
115 4420978977 : if (nbits > 1 &&
116 1778754859 : xfs_buf_item_straddle(bp, offset, first_bit, nbits))
117 0 : goto slow_scan;
118 :
119 2642224118 : (*nvecs)++;
120 2642224118 : *nbytes += nbits * XFS_BLF_CHUNK;
121 :
122 : /*
123 : * This takes the bit number to start looking from and
124 : * returns the next set bit from there. It returns -1
125 : * if there are no more bits set or the start bit is
126 : * beyond the end of the bitmap.
127 : */
128 2642224118 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
129 2642224118 : (uint)first_bit + nbits + 1);
130 2641959185 : } while (first_bit != -1);
131 :
132 : return;
133 :
134 : slow_scan:
135 : /* Count the first bit we jumped out of the above loop from */
136 0 : (*nvecs)++;
137 0 : *nbytes += XFS_BLF_CHUNK;
138 0 : last_bit = first_bit;
139 0 : while (last_bit != -1) {
140 : /*
141 : * This takes the bit number to start looking from and
142 : * returns the next set bit from there. It returns -1
143 : * if there are no more bits set or the start bit is
144 : * beyond the end of the bitmap.
145 : */
146 0 : next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
147 0 : last_bit + 1);
148 : /*
149 : * If we run out of bits, leave the loop,
150 : * else if we find a new set of bits bump the number of vecs,
151 : * else keep scanning the current set of bits.
152 : */
153 0 : if (next_bit == -1) {
154 : break;
155 0 : } else if (next_bit != last_bit + 1 ||
156 0 : xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
157 0 : last_bit = next_bit;
158 0 : first_bit = next_bit;
159 0 : (*nvecs)++;
160 0 : nbits = 1;
161 : } else {
162 0 : last_bit++;
163 0 : nbits++;
164 : }
165 0 : *nbytes += XFS_BLF_CHUNK;
166 : }
167 : }
168 :
169 : /*
170 : * Return the number of log iovecs and space needed to log the given buf log
171 : * item.
172 : *
173 : * Discontiguous buffers need a format structure per region that is being
174 : * logged. This makes the changes in the buffer appear to log recovery as though
175 : * they came from separate buffers, just like would occur if multiple buffers
176 : * were used instead of a single discontiguous buffer. This enables
177 : * discontiguous buffers to be in-memory constructs, completely transparent to
178 : * what ends up on disk.
179 : *
180 : * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
181 : * format structures. If the item has previously been logged and has dirty
182 : * regions, we do not relog them in stale buffers. This has the effect of
183 : * reducing the size of the relogged item by the amount of dirty data tracked
184 : * by the log item. This can result in the committing transaction reducing the
185 : * amount of space being consumed by the CIL.
186 : */
187 : STATIC void
188 1779863287 : xfs_buf_item_size(
189 : struct xfs_log_item *lip,
190 : int *nvecs,
191 : int *nbytes)
192 : {
193 1779863287 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
194 1779863287 : struct xfs_buf *bp = bip->bli_buf;
195 1779863287 : int i;
196 1779863287 : int bytes;
197 1779863287 : uint offset = 0;
198 :
199 1779863287 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
200 1779863287 : if (bip->bli_flags & XFS_BLI_STALE) {
201 : /*
202 : * The buffer is stale, so all we need to log is the buf log
203 : * format structure with the cancel flag in it as we are never
204 : * going to replay the changes tracked in the log item.
205 : */
206 29385911 : trace_xfs_buf_item_size_stale(bip);
207 29385929 : ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
208 29385929 : *nvecs += bip->bli_format_count;
209 58770563 : for (i = 0; i < bip->bli_format_count; i++) {
210 29384634 : *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
211 : }
212 30272091 : return;
213 : }
214 :
215 1750477376 : ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
216 :
217 1750477376 : if (bip->bli_flags & XFS_BLI_ORDERED) {
218 : /*
219 : * The buffer has been logged just to order it. It is not being
220 : * included in the transaction commit, so no vectors are used at
221 : * all.
222 : */
223 886163 : trace_xfs_buf_item_size_ordered(bip);
224 886162 : *nvecs = XFS_LOG_VEC_ORDERED;
225 886162 : return;
226 : }
227 :
228 : /*
229 : * The vector count is based on the number of buffer vectors we have
230 : * dirty bits in. This will only be greater than one when we have a
231 : * compound buffer with more than one segment dirty. Hence for compound
232 : * buffers we need to track which segment the dirty bits correspond to,
233 : * and when we move from one segment to the next increment the vector
234 : * count for the extra buf log format structure that will need to be
235 : * written.
236 : */
237 1749591213 : bytes = 0;
238 3499376778 : for (i = 0; i < bip->bli_format_count; i++) {
239 1749624836 : xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
240 : nvecs, &bytes);
241 1749785565 : offset += BBTOB(bp->b_maps[i].bm_len);
242 : }
243 :
244 : /*
245 : * Round up the buffer size required to minimise the number of memory
246 : * allocations that need to be done as this item grows when relogged by
247 : * repeated modifications.
248 : */
249 1749751942 : *nbytes = round_up(bytes, 512);
250 1749751942 : trace_xfs_buf_item_size(bip);
251 : }
252 :
253 : static inline void
254 2641584994 : xfs_buf_item_copy_iovec(
255 : struct xfs_log_vec *lv,
256 : struct xfs_log_iovec **vecp,
257 : struct xfs_buf *bp,
258 : uint offset,
259 : int first_bit,
260 : uint nbits)
261 : {
262 2641584994 : offset += first_bit * XFS_BLF_CHUNK;
263 2641584994 : xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
264 : xfs_buf_offset(bp, offset),
265 2641719675 : nbits * XFS_BLF_CHUNK);
266 2641167484 : }
267 :
268 : static void
269 1779088756 : xfs_buf_item_format_segment(
270 : struct xfs_buf_log_item *bip,
271 : struct xfs_log_vec *lv,
272 : struct xfs_log_iovec **vecp,
273 : uint offset,
274 : struct xfs_buf_log_format *blfp)
275 : {
276 1779088756 : struct xfs_buf *bp = bip->bli_buf;
277 1779088756 : uint base_size;
278 1779088756 : int first_bit;
279 1779088756 : int last_bit;
280 1779088756 : int next_bit;
281 1779088756 : uint nbits;
282 :
283 : /* copy the flags across from the base format item */
284 1779088756 : blfp->blf_flags = bip->__bli_format.blf_flags;
285 :
286 : /*
287 : * Base size is the actual size of the ondisk structure - it reflects
288 : * the actual size of the dirty bitmap rather than the size of the in
289 : * memory structure.
290 : */
291 1779088756 : base_size = xfs_buf_log_format_size(blfp);
292 :
293 1779088756 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
294 1779251793 : if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
295 : /*
296 : * If the map is not be dirty in the transaction, mark
297 : * the size as zero and do not advance the vector pointer.
298 : */
299 : return;
300 : }
301 :
302 1779231241 : blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
303 1779189848 : blfp->blf_size = 1;
304 :
305 1779189848 : if (bip->bli_flags & XFS_BLI_STALE) {
306 : /*
307 : * The buffer is stale, so all we need to log
308 : * is the buf log format structure with the
309 : * cancel flag in it.
310 : */
311 29384489 : trace_xfs_buf_item_format_stale(bip);
312 29384527 : ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
313 29384527 : return;
314 : }
315 :
316 :
317 : /*
318 : * Fill in an iovec for each set of contiguous chunks.
319 : */
320 2641390009 : do {
321 2641390009 : ASSERT(first_bit >= 0);
322 2641390009 : nbits = xfs_contig_bits(blfp->blf_data_map,
323 : blfp->blf_map_size, first_bit);
324 2642038485 : ASSERT(nbits > 0);
325 :
326 : /*
327 : * Straddling a page is rare because we don't log contiguous
328 : * chunks of unmapped buffers anywhere.
329 : */
330 4420866102 : if (nbits > 1 &&
331 1778726134 : xfs_buf_item_straddle(bp, offset, first_bit, nbits))
332 0 : goto slow_scan;
333 :
334 2642139968 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
335 : first_bit, nbits);
336 2641159937 : blfp->blf_size++;
337 :
338 : /*
339 : * This takes the bit number to start looking from and
340 : * returns the next set bit from there. It returns -1
341 : * if there are no more bits set or the start bit is
342 : * beyond the end of the bitmap.
343 : */
344 2641159937 : first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
345 2641159937 : (uint)first_bit + nbits + 1);
346 2641247717 : } while (first_bit != -1);
347 :
348 : return;
349 :
350 : slow_scan:
351 0 : ASSERT(bp->b_addr == NULL);
352 : last_bit = first_bit;
353 : nbits = 1;
354 0 : for (;;) {
355 : /*
356 : * This takes the bit number to start looking from and
357 : * returns the next set bit from there. It returns -1
358 : * if there are no more bits set or the start bit is
359 : * beyond the end of the bitmap.
360 : */
361 0 : next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
362 0 : (uint)last_bit + 1);
363 : /*
364 : * If we run out of bits fill in the last iovec and get out of
365 : * the loop. Else if we start a new set of bits then fill in
366 : * the iovec for the series we were looking at and start
367 : * counting the bits in the new one. Else we're still in the
368 : * same set of bits so just keep counting and scanning.
369 : */
370 0 : if (next_bit == -1) {
371 0 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
372 : first_bit, nbits);
373 0 : blfp->blf_size++;
374 0 : break;
375 0 : } else if (next_bit != last_bit + 1 ||
376 0 : xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
377 0 : xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
378 : first_bit, nbits);
379 0 : blfp->blf_size++;
380 0 : first_bit = next_bit;
381 0 : last_bit = next_bit;
382 0 : nbits = 1;
383 : } else {
384 0 : last_bit++;
385 0 : nbits++;
386 : }
387 : }
388 : }
389 :
390 : /*
391 : * This is called to fill in the vector of log iovecs for the
392 : * given log buf item. It fills the first entry with a buf log
393 : * format structure, and the rest point to contiguous chunks
394 : * within the buffer.
395 : */
396 : STATIC void
397 1778968895 : xfs_buf_item_format(
398 : struct xfs_log_item *lip,
399 : struct xfs_log_vec *lv)
400 : {
401 1778968895 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
402 1778968895 : struct xfs_buf *bp = bip->bli_buf;
403 1778968895 : struct xfs_log_iovec *vecp = NULL;
404 1778968895 : uint offset = 0;
405 1778968895 : int i;
406 :
407 1778968895 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
408 1778968895 : ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
409 : (bip->bli_flags & XFS_BLI_STALE));
410 1778968895 : ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
411 : (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
412 : && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
413 1778968895 : ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
414 : (bip->bli_flags & XFS_BLI_STALE));
415 :
416 :
417 : /*
418 : * If it is an inode buffer, transfer the in-memory state to the
419 : * format flags and clear the in-memory state.
420 : *
421 : * For buffer based inode allocation, we do not transfer
422 : * this state if the inode buffer allocation has not yet been committed
423 : * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
424 : * correct replay of the inode allocation.
425 : *
426 : * For icreate item based inode allocation, the buffers aren't written
427 : * to the journal during allocation, and hence we should always tag the
428 : * buffer as an inode buffer so that the correct unlinked list replay
429 : * occurs during recovery.
430 : */
431 1778968895 : if (bip->bli_flags & XFS_BLI_INODE_BUF) {
432 23051111 : if (xfs_has_v3inodes(lip->li_log->l_mp) ||
433 0 : !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
434 0 : xfs_log_item_in_current_chkpt(lip)))
435 23051111 : bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
436 23051111 : bip->bli_flags &= ~XFS_BLI_INODE_BUF;
437 : }
438 :
439 3557987099 : for (i = 0; i < bip->bli_format_count; i++) {
440 1778986573 : xfs_buf_item_format_segment(bip, lv, &vecp, offset,
441 1778986573 : &bip->bli_formats[i]);
442 1779018204 : offset += BBTOB(bp->b_maps[i].bm_len);
443 : }
444 :
445 : /*
446 : * Check to make sure everything is consistent.
447 : */
448 1779000526 : trace_xfs_buf_item_format(bip);
449 1779018113 : }
450 :
451 : /*
452 : * This is called to pin the buffer associated with the buf log item in memory
453 : * so it cannot be written out.
454 : *
455 : * We take a reference to the buffer log item here so that the BLI life cycle
456 : * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
457 : * inserted into the AIL.
458 : *
459 : * We also need to take a reference to the buffer itself as the BLI unpin
460 : * processing requires accessing the buffer after the BLI has dropped the final
461 : * BLI reference. See xfs_buf_item_unpin() for an explanation.
462 : * If unpins race to drop the final BLI reference and only the
463 : * BLI owns a reference to the buffer, then the loser of the race can have the
464 : * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
465 : * pin count ensures the life cycle of the buffer extends for as
466 : * long as we hold the buffer pin reference in xfs_buf_item_unpin().
467 : */
468 : STATIC void
469 243022035 : xfs_buf_item_pin(
470 : struct xfs_log_item *lip)
471 : {
472 243022035 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
473 :
474 243022035 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
475 243022035 : ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
476 : (bip->bli_flags & XFS_BLI_ORDERED) ||
477 : (bip->bli_flags & XFS_BLI_STALE));
478 :
479 243022035 : trace_xfs_buf_item_pin(bip);
480 :
481 243023531 : xfs_buf_hold(bip->bli_buf);
482 243028760 : atomic_inc(&bip->bli_refcount);
483 243029708 : atomic_inc(&bip->bli_buf->b_pin_count);
484 243028193 : }
485 :
486 : /*
487 : * This is called to unpin the buffer associated with the buf log item which was
488 : * previously pinned with a call to xfs_buf_item_pin(). We enter this function
489 : * with a buffer pin count, a buffer reference and a BLI reference.
490 : *
491 : * We must drop the BLI reference before we unpin the buffer because the AIL
492 : * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
493 : * refcount drops to zero, the bli could still be AIL resident and the buffer
494 : * submitted for I/O at any point before we return. This can result in IO
495 : * completion freeing the buffer while we are still trying to access it here.
496 : * This race condition can also occur in shutdown situations where we abort and
497 : * unpin buffers from contexts other than journal IO completion.
498 : *
499 : * Hence we have to hold a buffer reference per pin count to ensure that the
500 : * buffer cannot be freed until we have finished processing the unpin operation.
501 : * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
502 : * are done processing the buffer state. In the case of an abort (remove =
503 : * true) then we re-use the current pin reference as the IO reference we hand
504 : * off to IO failure handling.
505 : */
506 : STATIC void
507 243040704 : xfs_buf_item_unpin(
508 : struct xfs_log_item *lip,
509 : int remove)
510 : {
511 243040704 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
512 243040704 : struct xfs_buf *bp = bip->bli_buf;
513 243040704 : int stale = bip->bli_flags & XFS_BLI_STALE;
514 243040704 : int freed;
515 :
516 243040704 : ASSERT(bp->b_log_item == bip);
517 243040704 : ASSERT(atomic_read(&bip->bli_refcount) > 0);
518 :
519 243040704 : trace_xfs_buf_item_unpin(bip);
520 :
521 243040718 : freed = atomic_dec_and_test(&bip->bli_refcount);
522 486081462 : if (atomic_dec_and_test(&bp->b_pin_count))
523 224743410 : wake_up_all(&bp->b_waiters);
524 :
525 : /*
526 : * Nothing to do but drop the buffer pin reference if the BLI is
527 : * still active.
528 : */
529 243040730 : if (!freed) {
530 19117718 : xfs_buf_rele(bp);
531 19117718 : return;
532 : }
533 :
534 223923012 : if (stale) {
535 29386618 : ASSERT(bip->bli_flags & XFS_BLI_STALE);
536 29386618 : ASSERT(xfs_buf_islocked(bp));
537 29386618 : ASSERT(bp->b_flags & XBF_STALE);
538 29386618 : ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
539 29386618 : ASSERT(list_empty(&lip->li_trans));
540 29386618 : ASSERT(!bp->b_transp);
541 :
542 29386618 : trace_xfs_buf_item_unpin_stale(bip);
543 :
544 : /*
545 : * The buffer has been locked and referenced since it was marked
546 : * stale so we own both lock and reference exclusively here. We
547 : * do not need the pin reference any more, so drop it now so
548 : * that we only have one reference to drop once item completion
549 : * processing is complete.
550 : */
551 29386618 : xfs_buf_rele(bp);
552 :
553 : /*
554 : * If we get called here because of an IO error, we may or may
555 : * not have the item on the AIL. xfs_trans_ail_delete() will
556 : * take care of that situation. xfs_trans_ail_delete() drops
557 : * the AIL lock.
558 : */
559 29386618 : if (bip->bli_flags & XFS_BLI_STALE_INODE) {
560 137208 : xfs_buf_item_done(bp);
561 137208 : xfs_buf_inode_iodone(bp);
562 137208 : ASSERT(list_empty(&bp->b_li_list));
563 : } else {
564 29249410 : xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
565 29249410 : xfs_buf_item_relse(bp);
566 29249410 : ASSERT(bp->b_log_item == NULL);
567 : }
568 29386618 : xfs_buf_relse(bp);
569 29386618 : return;
570 : }
571 :
572 194536394 : if (remove) {
573 : /*
574 : * We need to simulate an async IO failures here to ensure that
575 : * the correct error completion is run on this buffer. This
576 : * requires a reference to the buffer and for the buffer to be
577 : * locked. We can safely pass ownership of the pin reference to
578 : * the IO to ensure that nothing can free the buffer while we
579 : * wait for the lock and then run the IO failure completion.
580 : */
581 670671 : xfs_buf_lock(bp);
582 670669 : bp->b_flags |= XBF_ASYNC;
583 670669 : xfs_buf_ioend_fail(bp);
584 670669 : return;
585 : }
586 :
587 : /*
588 : * BLI has no more active references - it will be moved to the AIL to
589 : * manage the remaining BLI/buffer life cycle. There is nothing left for
590 : * us to do here so drop the pin reference to the buffer.
591 : */
592 193865723 : xfs_buf_rele(bp);
593 : }
594 :
595 : STATIC uint
596 47330841 : xfs_buf_item_push(
597 : struct xfs_log_item *lip,
598 : struct list_head *buffer_list)
599 : {
600 47330841 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
601 47330841 : struct xfs_buf *bp = bip->bli_buf;
602 47330841 : uint rval = XFS_ITEM_SUCCESS;
603 :
604 47330841 : if (xfs_buf_ispinned(bp))
605 : return XFS_ITEM_PINNED;
606 45253578 : if (!xfs_buf_trylock(bp)) {
607 : /*
608 : * If we have just raced with a buffer being pinned and it has
609 : * been marked stale, we could end up stalling until someone else
610 : * issues a log force to unpin the stale buffer. Check for the
611 : * race condition here so xfsaild recognizes the buffer is pinned
612 : * and queues a log force to move it along.
613 : */
614 573485 : if (xfs_buf_ispinned(bp))
615 : return XFS_ITEM_PINNED;
616 573473 : return XFS_ITEM_LOCKED;
617 : }
618 :
619 44680114 : ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
620 :
621 44680114 : trace_xfs_buf_item_push(bip);
622 :
623 : /* has a previous flush failed due to IO errors? */
624 44680114 : if (bp->b_flags & XBF_WRITE_FAIL) {
625 1424 : xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
626 : "Failing async write on buffer block 0x%llx. Retrying async write.",
627 : (long long)xfs_buf_daddr(bp));
628 : }
629 :
630 44680114 : if (!xfs_buf_delwri_queue(bp, buffer_list))
631 997378 : rval = XFS_ITEM_FLUSHING;
632 44680114 : xfs_buf_unlock(bp);
633 44680114 : return rval;
634 : }
635 :
636 : /*
637 : * Drop the buffer log item refcount and take appropriate action. This helper
638 : * determines whether the bli must be freed or not, since a decrement to zero
639 : * does not necessarily mean the bli is unused.
640 : *
641 : * Return true if the bli is freed, false otherwise.
642 : */
643 : bool
644 12694654487 : xfs_buf_item_put(
645 : struct xfs_buf_log_item *bip)
646 : {
647 12694654487 : struct xfs_log_item *lip = &bip->bli_item;
648 12694654487 : bool aborted;
649 12694654487 : bool dirty;
650 :
651 : /* drop the bli ref and return if it wasn't the last one */
652 25395591195 : if (!atomic_dec_and_test(&bip->bli_refcount))
653 : return false;
654 :
655 : /*
656 : * We dropped the last ref and must free the item if clean or aborted.
657 : * If the bli is dirty and non-aborted, the buffer was clean in the
658 : * transaction but still awaiting writeback from previous changes. In
659 : * that case, the bli is freed on buffer writeback completion.
660 : */
661 15484078665 : aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
662 7741749401 : xlog_is_shutdown(lip->li_log);
663 7742329264 : dirty = bip->bli_flags & XFS_BLI_DIRTY;
664 7742329264 : if (dirty && !aborted)
665 : return false;
666 :
667 : /*
668 : * The bli is aborted or clean. An aborted item may be in the AIL
669 : * regardless of dirty state. For example, consider an aborted
670 : * transaction that invalidated a dirty bli and cleared the dirty
671 : * state.
672 : */
673 4846291773 : if (aborted)
674 12866 : xfs_trans_ail_delete(lip, 0);
675 4846291773 : xfs_buf_item_relse(bip->bli_buf);
676 4846291773 : return true;
677 : }
678 :
679 : /*
680 : * Release the buffer associated with the buf log item. If there is no dirty
681 : * logged data associated with the buffer recorded in the buf log item, then
682 : * free the buf log item and remove the reference to it in the buffer.
683 : *
684 : * This call ignores the recursion count. It is only called when the buffer
685 : * should REALLY be unlocked, regardless of the recursion count.
686 : *
687 : * We unconditionally drop the transaction's reference to the log item. If the
688 : * item was logged, then another reference was taken when it was pinned, so we
689 : * can safely drop the transaction reference now. This also allows us to avoid
690 : * potential races with the unpin code freeing the bli by not referencing the
691 : * bli after we've dropped the reference count.
692 : *
693 : * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
694 : * if necessary but do not unlock the buffer. This is for support of
695 : * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
696 : * free the item.
697 : */
698 : STATIC void
699 2191799728 : xfs_buf_item_release(
700 : struct xfs_log_item *lip)
701 : {
702 2191799728 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
703 2191799728 : struct xfs_buf *bp = bip->bli_buf;
704 2191799728 : bool released;
705 2191799728 : bool hold = bip->bli_flags & XFS_BLI_HOLD;
706 2191799728 : bool stale = bip->bli_flags & XFS_BLI_STALE;
707 : #if defined(DEBUG) || defined(XFS_WARN)
708 2191799728 : bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
709 2191799728 : bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
710 2191799728 : bool aborted = test_bit(XFS_LI_ABORTED,
711 : &lip->li_flags);
712 : #endif
713 :
714 2191799728 : trace_xfs_buf_item_release(bip);
715 :
716 : /*
717 : * The bli dirty state should match whether the blf has logged segments
718 : * except for ordered buffers, where only the bli should be dirty.
719 : */
720 2191814573 : ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
721 : (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
722 2191878080 : ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
723 :
724 : /*
725 : * Clear the buffer's association with this transaction and
726 : * per-transaction state from the bli, which has been copied above.
727 : */
728 2191878080 : bp->b_transp = NULL;
729 2191878080 : bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
730 :
731 : /*
732 : * Unref the item and unlock the buffer unless held or stale. Stale
733 : * buffers remain locked until final unpin unless the bli is freed by
734 : * the unref call. The latter implies shutdown because buffer
735 : * invalidation dirties the bli and transaction.
736 : */
737 2191878080 : released = xfs_buf_item_put(bip);
738 2191949612 : if (hold || (stale && !released))
739 : return;
740 2157087730 : ASSERT(!stale || aborted);
741 2157087730 : xfs_buf_relse(bp);
742 : }
743 :
744 : STATIC void
745 2189979186 : xfs_buf_item_committing(
746 : struct xfs_log_item *lip,
747 : xfs_csn_t seq)
748 : {
749 2189979186 : return xfs_buf_item_release(lip);
750 : }
751 :
752 : /*
753 : * This is called to find out where the oldest active copy of the
754 : * buf log item in the on disk log resides now that the last log
755 : * write of it completed at the given lsn.
756 : * We always re-log all the dirty data in a buffer, so usually the
757 : * latest copy in the on disk log is the only one that matters. For
758 : * those cases we simply return the given lsn.
759 : *
760 : * The one exception to this is for buffers full of newly allocated
761 : * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
762 : * flag set, indicating that only the di_next_unlinked fields from the
763 : * inodes in the buffers will be replayed during recovery. If the
764 : * original newly allocated inode images have not yet been flushed
765 : * when the buffer is so relogged, then we need to make sure that we
766 : * keep the old images in the 'active' portion of the log. We do this
767 : * by returning the original lsn of that transaction here rather than
768 : * the current one.
769 : */
770 : STATIC xfs_lsn_t
771 243040723 : xfs_buf_item_committed(
772 : struct xfs_log_item *lip,
773 : xfs_lsn_t lsn)
774 : {
775 243040723 : struct xfs_buf_log_item *bip = BUF_ITEM(lip);
776 :
777 243040723 : trace_xfs_buf_item_committed(bip);
778 :
779 243040723 : if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
780 125007 : return lip->li_lsn;
781 : return lsn;
782 : }
783 :
784 : static const struct xfs_item_ops xfs_buf_item_ops = {
785 : .iop_size = xfs_buf_item_size,
786 : .iop_format = xfs_buf_item_format,
787 : .iop_pin = xfs_buf_item_pin,
788 : .iop_unpin = xfs_buf_item_unpin,
789 : .iop_release = xfs_buf_item_release,
790 : .iop_committing = xfs_buf_item_committing,
791 : .iop_committed = xfs_buf_item_committed,
792 : .iop_push = xfs_buf_item_push,
793 : };
794 :
795 : STATIC void
796 4933672651 : xfs_buf_item_get_format(
797 : struct xfs_buf_log_item *bip,
798 : int count)
799 : {
800 4933672651 : ASSERT(bip->bli_formats == NULL);
801 4933672651 : bip->bli_format_count = count;
802 :
803 4933672651 : if (count == 1) {
804 4933672611 : bip->bli_formats = &bip->__bli_format;
805 4933672611 : return;
806 : }
807 :
808 40 : bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
809 : 0);
810 : }
811 :
812 : STATIC void
813 4933822740 : xfs_buf_item_free_format(
814 : struct xfs_buf_log_item *bip)
815 : {
816 4933822740 : if (bip->bli_formats != &bip->__bli_format) {
817 40 : kmem_free(bip->bli_formats);
818 40 : bip->bli_formats = NULL;
819 : }
820 4933822740 : }
821 :
822 : /*
823 : * Allocate a new buf log item to go with the given buffer.
824 : * Set the buffer's b_log_item field to point to the new
825 : * buf log item.
826 : */
827 : int
828 12710030341 : xfs_buf_item_init(
829 : struct xfs_buf *bp,
830 : struct xfs_mount *mp)
831 : {
832 12710030341 : struct xfs_buf_log_item *bip = bp->b_log_item;
833 12710030341 : int chunks;
834 12710030341 : int map_size;
835 12710030341 : int i;
836 :
837 : /*
838 : * Check to see if there is already a buf log item for
839 : * this buffer. If we do already have one, there is
840 : * nothing to do here so return.
841 : */
842 12710030341 : ASSERT(bp->b_mount == mp);
843 12710030341 : if (bip) {
844 7776762536 : ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
845 7776762536 : ASSERT(!bp->b_transp);
846 7776762536 : ASSERT(bip->bli_buf == bp);
847 7776762536 : return 0;
848 : }
849 :
850 4933267805 : bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
851 4933388777 : xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
852 4933622047 : bip->bli_buf = bp;
853 :
854 : /*
855 : * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
856 : * can be divided into. Make sure not to truncate any pieces.
857 : * map_size is the size of the bitmap needed to describe the
858 : * chunks of the buffer.
859 : *
860 : * Discontiguous buffer support follows the layout of the underlying
861 : * buffer. This makes the implementation as simple as possible.
862 : */
863 4933622047 : xfs_buf_item_get_format(bip, bp->b_map_count);
864 :
865 14800871951 : for (i = 0; i < bip->bli_format_count; i++) {
866 4933689620 : chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
867 : XFS_BLF_CHUNK);
868 4933689620 : map_size = DIV_ROUND_UP(chunks, NBWORD);
869 :
870 4933689620 : if (map_size > XFS_BLF_DATAMAP_SIZE) {
871 0 : kmem_cache_free(xfs_buf_item_cache, bip);
872 0 : xfs_err(mp,
873 : "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
874 : map_size,
875 : BBTOB(bp->b_maps[i].bm_len));
876 0 : return -EFSCORRUPTED;
877 : }
878 :
879 4933689620 : bip->bli_formats[i].blf_type = XFS_LI_BUF;
880 4933689620 : bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
881 4933689620 : bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
882 4933689620 : bip->bli_formats[i].blf_map_size = map_size;
883 : }
884 :
885 4933560284 : bp->b_log_item = bip;
886 4933560284 : xfs_buf_hold(bp);
887 4933560284 : return 0;
888 : }
889 :
890 :
891 : /*
892 : * Mark bytes first through last inclusive as dirty in the buf
893 : * item's bitmap.
894 : */
895 : static void
896 4196080404 : xfs_buf_item_log_segment(
897 : uint first,
898 : uint last,
899 : uint *map)
900 : {
901 4196080404 : uint first_bit;
902 4196080404 : uint last_bit;
903 4196080404 : uint bits_to_set;
904 4196080404 : uint bits_set;
905 4196080404 : uint word_num;
906 4196080404 : uint *wordp;
907 4196080404 : uint bit;
908 4196080404 : uint end_bit;
909 4196080404 : uint mask;
910 :
911 4196080404 : ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
912 4196080404 : ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
913 :
914 : /*
915 : * Convert byte offsets to bit numbers.
916 : */
917 4196080404 : first_bit = first >> XFS_BLF_SHIFT;
918 4196080404 : last_bit = last >> XFS_BLF_SHIFT;
919 :
920 : /*
921 : * Calculate the total number of bits to be set.
922 : */
923 4196080404 : bits_to_set = last_bit - first_bit + 1;
924 :
925 : /*
926 : * Get a pointer to the first word in the bitmap
927 : * to set a bit in.
928 : */
929 4196080404 : word_num = first_bit >> BIT_TO_WORD_SHIFT;
930 4196080404 : wordp = &map[word_num];
931 :
932 : /*
933 : * Calculate the starting bit in the first word.
934 : */
935 4196080404 : bit = first_bit & (uint)(NBWORD - 1);
936 :
937 : /*
938 : * First set any bits in the first word of our range.
939 : * If it starts at bit 0 of the word, it will be
940 : * set below rather than here. That is what the variable
941 : * bit tells us. The variable bits_set tracks the number
942 : * of bits that have been set so far. End_bit is the number
943 : * of the last bit to be set in this word plus one.
944 : */
945 4196080404 : if (bit) {
946 1718079326 : end_bit = min(bit + bits_to_set, (uint)NBWORD);
947 1718079326 : mask = ((1U << (end_bit - bit)) - 1) << bit;
948 1718079326 : *wordp |= mask;
949 1718079326 : wordp++;
950 1718079326 : bits_set = end_bit - bit;
951 : } else {
952 : bits_set = 0;
953 : }
954 :
955 : /*
956 : * Now set bits a whole word at a time that are between
957 : * first_bit and last_bit.
958 : */
959 4372060847 : while ((bits_to_set - bits_set) >= NBWORD) {
960 175980443 : *wordp = 0xffffffff;
961 175980443 : bits_set += NBWORD;
962 175980443 : wordp++;
963 : }
964 :
965 : /*
966 : * Finally, set any bits left to be set in one last partial word.
967 : */
968 4196080404 : end_bit = bits_to_set - bits_set;
969 4196080404 : if (end_bit) {
970 2449859563 : mask = (1U << end_bit) - 1;
971 2449859563 : *wordp |= mask;
972 : }
973 4196080404 : }
974 :
975 : /*
976 : * Mark bytes first through last inclusive as dirty in the buf
977 : * item's bitmap.
978 : */
979 : void
980 4195710473 : xfs_buf_item_log(
981 : struct xfs_buf_log_item *bip,
982 : uint first,
983 : uint last)
984 : {
985 4195710473 : int i;
986 4195710473 : uint start;
987 4195710473 : uint end;
988 4195710473 : struct xfs_buf *bp = bip->bli_buf;
989 :
990 : /*
991 : * walk each buffer segment and mark them dirty appropriately.
992 : */
993 4195710473 : start = 0;
994 8392263555 : for (i = 0; i < bip->bli_format_count; i++) {
995 4195935075 : if (start > last)
996 : break;
997 4195868600 : end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
998 :
999 : /* skip to the map that includes the first byte to log */
1000 4195868600 : if (first > end) {
1001 90520 : start += BBTOB(bp->b_maps[i].bm_len);
1002 90520 : continue;
1003 : }
1004 :
1005 : /*
1006 : * Trim the range to this segment and mark it in the bitmap.
1007 : * Note that we must convert buffer offsets to segment relative
1008 : * offsets (e.g., the first byte of each segment is byte 0 of
1009 : * that segment).
1010 : */
1011 4195778080 : if (first < start)
1012 : first = start;
1013 4195778080 : if (end > last)
1014 : end = last;
1015 4195778080 : xfs_buf_item_log_segment(first - start, end - start,
1016 4195778080 : &bip->bli_formats[i].blf_data_map[0]);
1017 :
1018 4196462562 : start += BBTOB(bp->b_maps[i].bm_len);
1019 : }
1020 4196394955 : }
1021 :
1022 :
1023 : /*
1024 : * Return true if the buffer has any ranges logged/dirtied by a transaction,
1025 : * false otherwise.
1026 : */
1027 : bool
1028 2192756016 : xfs_buf_item_dirty_format(
1029 : struct xfs_buf_log_item *bip)
1030 : {
1031 2192756016 : int i;
1032 :
1033 2258009274 : for (i = 0; i < bip->bli_format_count; i++) {
1034 2192755938 : if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1035 : bip->bli_formats[i].blf_map_size))
1036 : return true;
1037 : }
1038 :
1039 : return false;
1040 : }
1041 :
1042 : STATIC void
1043 4933897775 : xfs_buf_item_free(
1044 : struct xfs_buf_log_item *bip)
1045 : {
1046 4933897775 : xfs_buf_item_free_format(bip);
1047 4933981893 : kmem_free(bip->bli_item.li_lv_shadow);
1048 4933953842 : kmem_cache_free(xfs_buf_item_cache, bip);
1049 4933826002 : }
1050 :
1051 : /*
1052 : * xfs_buf_item_relse() is called when the buf log item is no longer needed.
1053 : */
1054 : void
1055 4933043014 : xfs_buf_item_relse(
1056 : struct xfs_buf *bp)
1057 : {
1058 4933043014 : struct xfs_buf_log_item *bip = bp->b_log_item;
1059 :
1060 4933043014 : trace_xfs_buf_item_relse(bp, _RET_IP_);
1061 4933099692 : ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
1062 :
1063 4933099692 : if (atomic_read(&bip->bli_refcount))
1064 : return;
1065 4933095321 : bp->b_log_item = NULL;
1066 4933095321 : xfs_buf_rele(bp);
1067 4934048903 : xfs_buf_item_free(bip);
1068 : }
1069 :
1070 : void
1071 57567816 : xfs_buf_item_done(
1072 : struct xfs_buf *bp)
1073 : {
1074 : /*
1075 : * If we are forcibly shutting down, this may well be off the AIL
1076 : * already. That's because we simulate the log-committed callbacks to
1077 : * unpin these buffers. Or we may never have put this item on AIL
1078 : * because of the transaction was aborted forcibly.
1079 : * xfs_trans_ail_delete() takes care of these.
1080 : *
1081 : * Either way, AIL is useless if we're forcing a shutdown.
1082 : *
1083 : * Note that log recovery writes might have buffer items that are not on
1084 : * the AIL even when the file system is not shut down.
1085 : */
1086 57567816 : xfs_trans_ail_delete(&bp->b_log_item->bli_item,
1087 57567816 : (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
1088 : SHUTDOWN_CORRUPT_INCORE);
1089 57567826 : xfs_buf_item_relse(bp);
1090 57567826 : }
|