Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_defer.h"
13 : #include "xfs_btree.h"
14 : #include "xfs_btree_staging.h"
15 : #include "xfs_bit.h"
16 : #include "xfs_log_format.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_sb.h"
19 : #include "xfs_alloc.h"
20 : #include "xfs_alloc_btree.h"
21 : #include "xfs_rmap.h"
22 : #include "xfs_rmap_btree.h"
23 : #include "xfs_inode.h"
24 : #include "xfs_refcount.h"
25 : #include "xfs_extent_busy.h"
26 : #include "xfs_health.h"
27 : #include "xfs_bmap.h"
28 : #include "xfs_ialloc.h"
29 : #include "xfs_ag.h"
30 : #include "scrub/xfs_scrub.h"
31 : #include "scrub/scrub.h"
32 : #include "scrub/common.h"
33 : #include "scrub/btree.h"
34 : #include "scrub/trace.h"
35 : #include "scrub/repair.h"
36 : #include "scrub/bitmap.h"
37 : #include "scrub/xfile.h"
38 : #include "scrub/xfarray.h"
39 : #include "scrub/newbt.h"
40 : #include "scrub/reap.h"
41 :
42 : /*
43 : * Free Space Btree Repair
44 : * =======================
45 : *
46 : * The reverse mappings are supposed to record all space usage for the entire
47 : * AG. Therefore, we can recalculate the free extents in an AG by looking for
48 : * gaps in the physical extents recorded in the rmapbt. On a reflink
49 : * filesystem this is a little more tricky in that we have to be aware that
50 : * the rmap records are allowed to overlap.
51 : *
52 : * We derive which blocks belonged to the old bnobt/cntbt by recording all the
53 : * OWN_AG extents and subtracting out the blocks owned by all other OWN_AG
54 : * metadata: the rmapbt blocks visited while iterating the reverse mappings
55 : * and the AGFL blocks.
56 : *
57 : * Once we have both of those pieces, we can reconstruct the bnobt and cntbt
58 : * by blowing out the free block state and freeing all the extents that we
59 : * found. This adds the requirement that we can't have any busy extents in
60 : * the AG because the busy code cannot handle duplicate records.
61 : *
62 : * Note that we can only rebuild both free space btrees at the same time
63 : * because the regular extent freeing infrastructure loads both btrees at the
64 : * same time.
65 : *
66 : * We use the prefix 'xrep_abt' here because we regenerate both free space
67 : * allocation btrees at the same time.
68 : */
69 :
/* In-memory state for a free space btree (bnobt + cntbt) repair. */
struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/* All OWN_AG blocks. */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* new cntbt information */
	struct xrep_newbt	new_cntbt;

	/* Free space extents. */
	struct xfarray		*free_records;

	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG. */
	xfs_agblock_t		longest;
};
109 :
/*
 * Set up to repair AG free space btrees.  Returns 0 if the busy extent list
 * is (or has been flushed) empty, or a negative errno.
 */
int
xrep_setup_ag_allocbt(
	struct xfs_scrub	*sc)
{
	unsigned int		busy_gen;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.  Sample pagb_gen /before/ the emptiness check so
	 * that the flush below notices any extents added in between.
	 */
	busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
	if (xfs_extent_busy_list_empty(sc->sa.pag))
		return 0;

	return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
}
127 :
/*
 * Check for any obvious conflicts in the free extent @rec by cross-
 * referencing it against the inode and refcount btrees.  Returns
 * -EFSCORRUPTED if the candidate extent overlaps any other owner.
 */
STATIC int
xrep_abt_check_free_ext(
	struct xfs_scrub	*sc,
	const struct xfs_alloc_rec_incore *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	/* Basic record sanity (bounds, nonzero length, etc.). */
	if (xfs_alloc_check_perag_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->ar_startblock, rec->ar_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/*
	 * Must not be shared or CoW staging.  refc_cur is only set up on
	 * filesystems that have a refcount btree.
	 */
	if (sc->sa.refc_cur) {
		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;

		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;
	}

	return 0;
}
169 :
170 : /*
171 : * Stash a free space record for all the space since the last bno we found
172 : * all the way up to @end.
173 : */
174 : static int
175 45034135 : xrep_abt_stash(
176 : struct xrep_abt *ra,
177 : xfs_agblock_t end)
178 : {
179 45034135 : struct xfs_alloc_rec_incore arec = {
180 45034135 : .ar_startblock = ra->next_agbno,
181 45034135 : .ar_blockcount = end - ra->next_agbno,
182 : };
183 45034135 : struct xfs_scrub *sc = ra->sc;
184 45034135 : int error = 0;
185 :
186 45034135 : if (xchk_should_terminate(sc, &error))
187 0 : return error;
188 :
189 45033959 : error = xrep_abt_check_free_ext(ra->sc, &arec);
190 45034615 : if (error)
191 : return error;
192 :
193 45034622 : trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
194 :
195 45034240 : error = xfarray_append(ra->free_records, &arec);
196 45033650 : if (error)
197 : return error;
198 :
199 45033650 : ra->nr_blocks += arec.ar_blockcount;
200 45033650 : return 0;
201 : }
202 :
/*
 * Record extents that aren't in use from gaps in the rmap records.
 * Called once per rmap record via xfs_rmap_query_all; @priv is the
 * struct xrep_abt being filled in.
 */
STATIC int
xrep_abt_walk_rmap(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *rec,
	void			*priv)
{
	struct xrep_abt		*ra = priv;
	int			error;

	/* Record all the OWN_AG blocks... */
	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
				rec->rm_startblock, rec->rm_blockcount);
		if (error)
			return error;
	}

	/* ...and all the rmapbt blocks... */
	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
	if (error)
		return error;

	/* ...and all the free space. */
	if (rec->rm_startblock > ra->next_agbno) {
		error = xrep_abt_stash(ra, rec->rm_startblock);
		if (error)
			return error;
	}

	/*
	 * rmap records can overlap on reflink filesystems, so project
	 * next_agbno as far out into the AG space as we currently know about.
	 */
	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}
241 :
242 : /* Collect an AGFL block for the not-to-release list. */
243 : static int
244 997465 : xrep_abt_walk_agfl(
245 : struct xfs_mount *mp,
246 : xfs_agblock_t agbno,
247 : void *priv)
248 : {
249 997465 : struct xrep_abt *ra = priv;
250 :
251 997465 : return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
252 : }
253 :
254 : /*
255 : * Compare two free space extents by block number. We want to sort in order of
256 : * increasing block number.
257 : */
258 : static int
259 606767470 : xrep_bnobt_extent_cmp(
260 : const void *a,
261 : const void *b)
262 : {
263 1490414635 : const struct xfs_alloc_rec_incore *ap = a;
264 1490414635 : const struct xfs_alloc_rec_incore *bp = b;
265 :
266 606767470 : if (ap->ar_startblock > bp->ar_startblock)
267 : return 1;
268 809806925 : else if (ap->ar_startblock < bp->ar_startblock)
269 809805108 : return -1;
270 : return 0;
271 : }
272 :
273 : /*
274 : * Re-sort the free extents by block number so so that we can put the records
275 : * into the bnobt in the correct order. Make sure the records do not overlap
276 : * in physical space.
277 : */
278 : STATIC int
279 142281 : xrep_bnobt_sort_records(
280 : struct xrep_abt *ra)
281 : {
282 142281 : struct xfs_alloc_rec_incore arec;
283 142281 : xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
284 142281 : xfs_agblock_t next_agbno = 0;
285 142281 : int error;
286 :
287 142281 : error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
288 142248 : if (error)
289 : return error;
290 :
291 45176759 : while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
292 45034511 : if (arec.ar_startblock < next_agbno)
293 : return -EFSCORRUPTED;
294 :
295 45034511 : next_agbno = arec.ar_startblock + arec.ar_blockcount;
296 : }
297 :
298 : return error;
299 : }
300 :
301 : /*
302 : * Compare two free space extents by length and then block number. We want
303 : * to sort first in order of increasing length and then in order of increasing
304 : * block number.
305 : */
306 : static int
307 1195476368 : xrep_cntbt_extent_cmp(
308 : const void *a,
309 : const void *b)
310 : {
311 1195476368 : const struct xfs_alloc_rec_incore *ap = a;
312 1195476368 : const struct xfs_alloc_rec_incore *bp = b;
313 :
314 1195476368 : if (ap->ar_blockcount > bp->ar_blockcount)
315 : return 1;
316 1039644627 : else if (ap->ar_blockcount < bp->ar_blockcount)
317 : return -1;
318 883647165 : return xrep_bnobt_extent_cmp(a, b);
319 : }
320 :
321 : /*
322 : * Sort the free extents by length so so that we can put the records into the
323 : * cntbt in the correct order. Don't let userspace kill us if we're resorting
324 : * after allocating btree blocks.
325 : */
326 : STATIC int
327 283819 : xrep_cntbt_sort_records(
328 : struct xrep_abt *ra,
329 : bool is_resort)
330 : {
331 568207 : return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
332 283819 : is_resort ? 0 : XFARRAY_SORT_KILLABLE);
333 : }
334 :
335 : /*
336 : * Iterate all reverse mappings to find (1) the gaps between rmap records (all
337 : * unowned space), (2) the OWN_AG extents (which encompass the free space
338 : * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
339 : * blocks. The free space is (1) + (2) - (3) - (4).
340 : */
341 : STATIC int
342 142006 : xrep_abt_find_freespace(
343 : struct xrep_abt *ra)
344 : {
345 142006 : struct xfs_scrub *sc = ra->sc;
346 142006 : struct xfs_mount *mp = sc->mp;
347 142006 : struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
348 142006 : struct xfs_buf *agfl_bp;
349 142006 : xfs_agblock_t agend;
350 142006 : int error;
351 :
352 142006 : xagb_bitmap_init(&ra->not_allocbt_blocks);
353 :
354 141906 : xrep_ag_btcur_init(sc, &sc->sa);
355 :
356 : /*
357 : * Iterate all the reverse mappings to find gaps in the physical
358 : * mappings, all the OWN_AG blocks, and all the rmapbt extents.
359 : */
360 142216 : error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
361 142110 : if (error)
362 0 : goto err;
363 :
364 : /* Insert a record for space between the last rmap and EOAG. */
365 142110 : agend = be32_to_cpu(agf->agf_length);
366 142110 : if (ra->next_agbno < agend) {
367 141010 : error = xrep_abt_stash(ra, agend);
368 141188 : if (error)
369 0 : goto err;
370 : }
371 :
372 : /* Collect all the AGFL blocks. */
373 142288 : error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
374 142130 : if (error)
375 0 : goto err;
376 :
377 142130 : error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
378 142211 : if (error)
379 0 : goto err_agfl;
380 :
381 : /* Compute the old bnobt/cntbt blocks. */
382 142211 : error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
383 : &ra->not_allocbt_blocks);
384 141681 : if (error)
385 0 : goto err_agfl;
386 :
387 141681 : ra->nr_real_records = xfarray_length(ra->free_records);
388 141575 : err_agfl:
389 141575 : xfs_trans_brelse(sc->tp, agfl_bp);
390 142196 : err:
391 142196 : xchk_ag_btcur_free(&sc->sa);
392 142280 : xagb_bitmap_destroy(&ra->not_allocbt_blocks);
393 142088 : return error;
394 : }
395 :
396 : /*
397 : * We're going to use the observed free space records to reserve blocks for the
398 : * new free space btrees, so we play an iterative game where we try to converge
399 : * on the number of blocks we need:
400 : *
401 : * 1. Estimate how many blocks we'll need to store the records.
402 : * 2. If the first free record has more blocks than we need, we're done.
403 : * We will have to re-sort the records prior to building the cntbt.
404 : * 3. If that record has exactly the number of blocks we need, null out the
405 : * record. We're done.
406 : * 4. Otherwise, we still need more blocks. Null out the record, subtract its
407 : * length from the number of blocks we need, and go back to step 1.
408 : *
409 : * Fortunately, we don't have to do any transaction work to play this game, so
410 : * we don't have to tear down the staging cursors.
411 : */
412 : STATIC int
413 142212 : xrep_abt_reserve_space(
414 : struct xrep_abt *ra,
415 : struct xfs_btree_cur *bno_cur,
416 : struct xfs_btree_cur *cnt_cur,
417 : bool *needs_resort)
418 : {
419 142212 : struct xfs_scrub *sc = ra->sc;
420 142212 : xfarray_idx_t record_nr;
421 142212 : unsigned int allocated = 0;
422 142212 : int error = 0;
423 :
424 142212 : record_nr = xfarray_length(ra->free_records) - 1;
425 142204 : do {
426 142168 : struct xfs_alloc_rec_incore arec;
427 142168 : uint64_t required;
428 142168 : unsigned int desired;
429 142168 : unsigned int len;
430 :
431 : /* Compute how many blocks we'll need. */
432 142168 : error = xfs_btree_bload_compute_geometry(cnt_cur,
433 : &ra->new_cntbt.bload, ra->nr_real_records);
434 141814 : if (error)
435 : break;
436 :
437 141814 : error = xfs_btree_bload_compute_geometry(bno_cur,
438 : &ra->new_bnobt.bload, ra->nr_real_records);
439 142190 : if (error)
440 : break;
441 :
442 : /* How many btree blocks do we need to store all records? */
443 142190 : required = ra->new_bnobt.bload.nr_blocks +
444 142190 : ra->new_cntbt.bload.nr_blocks;
445 142190 : ASSERT(required < INT_MAX);
446 :
447 : /* If we've reserved enough blocks, we're done. */
448 142190 : if (allocated >= required)
449 : break;
450 :
451 142185 : desired = required - allocated;
452 :
453 : /* We need space but there's none left; bye! */
454 142185 : if (ra->nr_real_records == 0) {
455 : error = -ENOSPC;
456 : break;
457 : }
458 :
459 : /* Grab the first record from the list. */
460 142155 : error = xfarray_load(ra->free_records, record_nr, &arec);
461 141969 : if (error)
462 : break;
463 :
464 141969 : ASSERT(arec.ar_blockcount <= UINT_MAX);
465 141969 : len = min_t(unsigned int, arec.ar_blockcount, desired);
466 :
467 141969 : trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
468 : arec.ar_startblock, len, XFS_RMAP_OWN_AG);
469 :
470 141686 : error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
471 : arec.ar_startblock, len);
472 141747 : if (error)
473 : break;
474 141747 : allocated += len;
475 141747 : ra->nr_blocks -= len;
476 :
477 141747 : if (arec.ar_blockcount > desired) {
478 : /*
479 : * Record has more space than we need. The number of
480 : * free records doesn't change, so shrink the free
481 : * record, inform the caller that the records are no
482 : * longer sorted by length, and exit.
483 : */
484 141711 : arec.ar_startblock += desired;
485 141711 : arec.ar_blockcount -= desired;
486 141711 : error = xfarray_store(ra->free_records, record_nr,
487 : &arec);
488 142162 : if (error)
489 : break;
490 :
491 142163 : *needs_resort = true;
492 142163 : return 0;
493 : }
494 :
495 : /*
496 : * We're going to use up the entire record, so unset it and
497 : * move on to the next one. This changes the number of free
498 : * records (but doesn't break the sorting order), so we must
499 : * go around the loop once more to re-run _bload_init.
500 : */
501 36 : error = xfarray_unset(ra->free_records, record_nr);
502 36 : if (error)
503 : break;
504 36 : ra->nr_real_records--;
505 36 : record_nr--;
506 : } while (1);
507 :
508 34 : return error;
509 : }
510 :
/*
 * Dispose of a single new-btree block reservation: add a deferred OWN_AG
 * rmap for the blocks we actually used, and free any leftover blocks back
 * into the (newly rebuilt) free space btrees.
 */
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0) {
		xfs_fsblock_t	fsbno;

		fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, resv->agbno);
		xfs_rmap_alloc_extent(sc->tp, false, fsbno, resv->used,
				XFS_RMAP_OWN_AG);
	}

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, ra->new_bnobt.oinfo.oi_owner);

	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	/* Finish the deferred free now so the btrees stay consistent. */
	return xrep_defer_finish(sc);
}
551 :
552 : /*
553 : * Deal with all the space we reserved. Blocks that were allocated for the
554 : * free space btrees need to have a (deferred) rmap added for the OWN_AG
555 : * allocation, and blocks that didn't get used can be freed via the usual
556 : * (deferred) means.
557 : */
558 : STATIC void
559 142252 : xrep_abt_dispose_reservations(
560 : struct xrep_abt *ra,
561 : int error)
562 : {
563 142252 : struct xrep_newbt_resv *resv, *n;
564 :
565 142252 : if (error)
566 30 : goto junkit;
567 :
568 284427 : for_each_xrep_newbt_reservation(&ra->new_bnobt, resv, n) {
569 142248 : error = xrep_abt_dispose_one(ra, resv);
570 142205 : if (error)
571 0 : goto junkit;
572 : }
573 :
574 142179 : junkit:
575 284415 : for_each_xrep_newbt_reservation(&ra->new_bnobt, resv, n) {
576 142173 : xfs_perag_put(resv->pag);
577 142318 : list_del(&resv->list);
578 142294 : kfree(resv);
579 : }
580 :
581 142242 : xrep_newbt_cancel(&ra->new_bnobt);
582 142103 : xrep_newbt_cancel(&ra->new_cntbt);
583 142235 : }
584 :
/*
 * Retrieve free space data for bulk load.  Copies up to @nr_wanted records
 * from ra->free_records into @block starting at @idx, tracks the longest
 * extent seen, and returns the number of records loaded (or a negative
 * errno).
 */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
	struct xrep_abt		*ra = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		/* Remember the longest extent for the AGF. */
		ra->longest = max(ra->longest, arec->ar_blockcount);

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}
614 :
/*
 * Feed one of the new btree blocks to the bulk loader.  All reservations
 * live in new_bnobt regardless of which tree is being loaded.
 */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
}
626 :
627 : /*
628 : * Reset the AGF counters to reflect the free space btrees that we just
629 : * rebuilt, then reinitialize the per-AG data.
630 : */
631 : STATIC int
632 142292 : xrep_abt_reset_counters(
633 : struct xrep_abt *ra)
634 : {
635 142292 : struct xfs_scrub *sc = ra->sc;
636 142292 : struct xfs_perag *pag = sc->sa.pag;
637 142292 : struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
638 142292 : unsigned int freesp_btreeblks = 0;
639 :
640 : /*
641 : * Compute the contribution to agf_btreeblks for the new free space
642 : * btrees. This is the computed btree size minus anything we didn't
643 : * use.
644 : */
645 142292 : freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
646 142292 : freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
647 :
648 142292 : freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
649 142311 : freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
650 :
651 : /*
652 : * The AGF header contains extra information related to the free space
653 : * btrees, so we must update those fields here.
654 : */
655 142317 : agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
656 : (be32_to_cpu(agf->agf_rmap_blocks) - 1));
657 142317 : agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
658 142317 : agf->agf_longest = cpu_to_be32(ra->longest);
659 142317 : xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
660 : XFS_AGF_LONGEST |
661 : XFS_AGF_FREEBLKS);
662 :
663 : /*
664 : * After we commit the new btree to disk, it is possible that the
665 : * process to reap the old btree blocks will race with the AIL trying
666 : * to checkpoint the old btree blocks into the filesystem. If the new
667 : * tree is shorter than the old one, the allocbt write verifier will
668 : * fail and the AIL will shut down the filesystem.
669 : *
670 : * To avoid this, save the old incore btree height values as the alt
671 : * height values before re-initializing the perag info from the updated
672 : * AGF to capture all the new values.
673 : */
674 142327 : pag->pagf_alt_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
675 142327 : pag->pagf_alt_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
676 :
677 : /* Reinitialize with the values we just logged. */
678 142327 : return xrep_reinit_pagf(sc);
679 : }
680 :
681 : /*
682 : * Use the collected free space information to stage new free space btrees.
683 : * If this is successful we'll return with the new btree root
684 : * information logged to the repair transaction but not yet committed.
685 : */
686 : STATIC int
687 141948 : xrep_abt_build_new_trees(
688 : struct xrep_abt *ra)
689 : {
690 141948 : struct xfs_scrub *sc = ra->sc;
691 141948 : struct xfs_btree_cur *bno_cur;
692 141948 : struct xfs_btree_cur *cnt_cur;
693 141948 : struct xfs_perag *pag = sc->sa.pag;
694 141948 : bool needs_resort = false;
695 141948 : int error;
696 :
697 : /*
698 : * Sort the free extents by length so that we can set up the free space
699 : * btrees in as few extents as possible. This reduces the amount of
700 : * deferred rmap / free work we have to do at the end.
701 : */
702 141948 : error = xrep_cntbt_sort_records(ra, false);
703 142152 : if (error)
704 : return error;
705 :
706 : /*
707 : * Prepare to construct the new btree by reserving disk space for the
708 : * new btree and setting up all the accounting information we'll need
709 : * to root the new btree while it's under construction and before we
710 : * attach it to the AG header.
711 : */
712 142121 : xrep_newbt_init_bare(&ra->new_bnobt, sc);
713 141787 : xrep_newbt_init_bare(&ra->new_cntbt, sc);
714 :
715 142134 : ra->new_bnobt.bload.get_records = xrep_abt_get_records;
716 142134 : ra->new_cntbt.bload.get_records = xrep_abt_get_records;
717 :
718 142134 : ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
719 142134 : ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
720 :
721 : /* Allocate cursors for the staged btrees. */
722 142134 : bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
723 : pag, XFS_BTNUM_BNO);
724 142069 : cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
725 : pag, XFS_BTNUM_CNT);
726 :
727 : /* Last chance to abort before we start committing fixes. */
728 142259 : if (xchk_should_terminate(sc, &error))
729 0 : goto err_cur;
730 :
731 : /* Reserve the space we'll need for the new btrees. */
732 142234 : error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
733 142187 : if (error)
734 30 : goto err_cur;
735 :
736 : /*
737 : * If we need to re-sort the free extents by length, do so so that we
738 : * can put the records into the cntbt in the correct order.
739 : */
740 142157 : if (needs_resort) {
741 142152 : error = xrep_cntbt_sort_records(ra, needs_resort);
742 142173 : if (error)
743 0 : goto err_cur;
744 : }
745 :
746 : /*
747 : * Due to btree slack factors, it's possible for a new btree to be one
748 : * level taller than the old btree. Update the alternate incore btree
749 : * height so that we don't trip the verifiers when writing the new
750 : * btree blocks to disk.
751 : */
752 142178 : pag->pagf_alt_levels[XFS_BTNUM_BNOi] =
753 142178 : ra->new_bnobt.bload.btree_height;
754 142178 : pag->pagf_alt_levels[XFS_BTNUM_CNTi] =
755 142178 : ra->new_cntbt.bload.btree_height;
756 :
757 : /* Load the free space by length tree. */
758 142178 : ra->array_cur = XFARRAY_CURSOR_INIT;
759 142178 : ra->longest = 0;
760 142178 : error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
761 142295 : if (error)
762 0 : goto err_levels;
763 :
764 142295 : error = xrep_bnobt_sort_records(ra);
765 142297 : if (error)
766 : return error;
767 :
768 : /* Load the free space by block number tree. */
769 142285 : ra->array_cur = XFARRAY_CURSOR_INIT;
770 142285 : error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
771 142324 : if (error)
772 0 : goto err_levels;
773 :
774 : /*
775 : * Install the new btrees in the AG header. After this point the old
776 : * btrees are no longer accessible and the new trees are live.
777 : */
778 142324 : xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
779 142277 : xfs_btree_del_cursor(bno_cur, 0);
780 142281 : xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
781 142282 : xfs_btree_del_cursor(cnt_cur, 0);
782 :
783 : /* Reset the AGF counters now that we've changed the btree shape. */
784 142325 : error = xrep_abt_reset_counters(ra);
785 142262 : if (error)
786 0 : goto err_newbt;
787 :
788 : /* Dispose of any unused blocks and the accounting information. */
789 142262 : xrep_abt_dispose_reservations(ra, error);
790 :
791 142230 : return xrep_roll_ag_trans(sc);
792 :
793 0 : err_levels:
794 0 : pag->pagf_alt_levels[XFS_BTNUM_BNOi] = 0;
795 0 : pag->pagf_alt_levels[XFS_BTNUM_CNTi] = 0;
796 30 : err_cur:
797 30 : xfs_btree_del_cursor(cnt_cur, error);
798 30 : xfs_btree_del_cursor(bno_cur, error);
799 30 : err_newbt:
800 30 : xrep_abt_dispose_reservations(ra, error);
801 30 : return error;
802 : }
803 :
804 : /*
805 : * Now that we've logged the roots of the new btrees, invalidate all of the
806 : * old blocks and free them.
807 : */
808 : STATIC int
809 141943 : xrep_abt_remove_old_trees(
810 : struct xrep_abt *ra)
811 : {
812 141943 : struct xfs_perag *pag = ra->sc->sa.pag;
813 141943 : int error;
814 :
815 : /* Free the old btree blocks if they're not in use. */
816 141943 : error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
817 : &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
818 142041 : if (error)
819 : return error;
820 :
821 : /*
822 : * Now that we've zapped all the old allocbt blocks we can turn off
823 : * the alternate height mechanism.
824 : */
825 142041 : pag->pagf_alt_levels[XFS_BTNUM_BNOi] = 0;
826 142041 : pag->pagf_alt_levels[XFS_BTNUM_CNTi] = 0;
827 142041 : return 0;
828 : }
829 :
/*
 * Repair the freespace btrees for some AG.  This is the main entry point:
 * it gathers free space from the rmapbt, builds and commits new bnobt/cntbt
 * trees, and reaps the old tree blocks.
 */
int
xrep_allocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_abt		*ra;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
	if (!ra)
		return -ENOMEM;
	ra->sc = sc;

	/* We rebuild both data structures. */
	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.  In theory we cleared this before we started, but
	 * let's not risk the filesystem.
	 */
	if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
		error = -EDEADLOCK;
		goto out_ra;
	}

	/* Set up enough storage to handle maximally fragmented free space. */
	descr = xchk_xfile_ag_descr(sc, "free space records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
			sizeof(struct xfs_alloc_rec_incore),
			&ra->free_records);
	kfree(descr);
	if (error)
		goto out_ra;

	/* Collect the free space data and find the old btree blocks. */
	xagb_bitmap_init(&ra->old_allocbt_blocks);
	error = xrep_abt_find_freespace(ra);
	if (error)
		goto out_bitmap;

	/* Rebuild the free space information. */
	error = xrep_abt_build_new_trees(ra);
	if (error)
		goto out_bitmap;

	/* Kill the old trees. */
	error = xrep_abt_remove_old_trees(ra);

out_bitmap:
	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
	xfarray_destroy(ra->free_records);
out_ra:
	kfree(ra);
	return error;
}
892 :
/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_allocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
	error = xchk_bnobt(sc);
	if (error)
		goto out;

	sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
	error = xchk_cntbt(sc);
out:
	/* Always restore the caller's scrub type before returning. */
	sc->sm->sm_type = old_type;
	return error;
}
|