Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0+
2 : /*
3 : * Copyright (C) 2016 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_alloc.h"
14 : #include "xfs_errortag.h"
15 : #include "xfs_error.h"
16 : #include "xfs_trace.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_rmap_btree.h"
19 : #include "xfs_btree.h"
20 : #include "xfs_refcount_btree.h"
21 : #include "xfs_ialloc_btree.h"
22 : #include "xfs_ag.h"
23 : #include "xfs_ag_resv.h"
24 :
25 : /*
26 : * Per-AG Block Reservations
27 : *
28 : * For some kinds of allocation group metadata structures, it is advantageous
29 : * to reserve a small number of blocks in each AG so that future expansions of
30 : * that data structure do not encounter ENOSPC because errors during a btree
31 : * split cause the filesystem to go offline.
32 : *
33 : * Prior to the introduction of reflink, this wasn't an issue because the free
34 : * space btrees maintain a reserve of space (the AGFL) to handle any expansion
35 : * that may be necessary; and allocations of other metadata (inodes, BMBT,
36 : * dir/attr) aren't restricted to a single AG. However, with reflink it is
37 : * possible to allocate all the space in an AG, have subsequent reflink/CoW
38 : * activity expand the refcount btree, and discover that there's no space left
39 : * to handle that expansion. Since we can calculate the maximum size of the
40 : * refcount btree, we can reserve space for it and avoid ENOSPC.
41 : *
42 : * Handling per-AG reservations consists of four changes to the allocator's
43 : * behavior: First, because these reservations are always needed, we decrease
44 : * the ag_max_usable counter to reflect the size of the AG after the reserved
45 : * blocks are taken. Second, the reservations must be reflected in the
46 : * fdblocks count to maintain proper accounting. Third, each AG must maintain
47 : * its own reserved block counter so that we can calculate the amount of space
48 : * that must remain free to maintain the reservations. Fourth, the "remaining
49 : * reserved blocks" count must be used when calculating the length of the
50 : * longest free extent in an AG and to clamp maxlen in the per-AG allocation
51 : * functions. In other words, we maintain a virtual allocation via in-core
52 : * accounting tricks so that we don't have to clean up after a crash. :)
53 : *
54 : * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
55 : * values via struct xfs_alloc_arg or directly to the xfs_free_extent
56 : * function. It might seem a little funny to maintain a reservoir of blocks
57 : * to feed another reservoir, but the AGFL only holds enough blocks to get
58 : * through the next transaction. The per-AG reservation is to ensure (we
59 : * hope) that each AG never runs out of blocks. Each data structure wanting
60 : * to use the reservation system should update ask/used in xfs_ag_resv_init.
61 : */
62 :
63 : /*
64 : * Are we critically low on blocks? For now we'll define that as the number
65 : * of blocks we can get our hands on being less than 10% of what we reserved
66 : * or less than some arbitrary number (maximum btree height).
67 : */
68 : bool
69 152044548 : xfs_ag_resv_critical(
70 : struct xfs_perag *pag,
71 : enum xfs_ag_resv_type type)
72 : {
73 152044548 : xfs_extlen_t avail;
74 152044548 : xfs_extlen_t orig;
75 :
76 : /*
77 : * Pretend we're critically low on reservations in this AG to scare
78 : * everyone else away.
79 : */
80 304089096 : if (xfs_perag_prohibits_alloc(pag))
81 : return true;
82 :
83 152044548 : switch (type) {
84 76021624 : case XFS_AG_RESV_METADATA:
85 76021624 : avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
86 76021624 : orig = pag->pag_meta_resv.ar_asked;
87 76021624 : break;
88 76022924 : case XFS_AG_RESV_RMAPBT:
89 76022924 : avail = pag->pagf_freeblks + pag->pagf_flcount -
90 76022924 : pag->pag_meta_resv.ar_reserved;
91 76022924 : orig = pag->pag_rmapbt_resv.ar_asked;
92 76022924 : break;
93 0 : default:
94 0 : ASSERT(0);
95 0 : return false;
96 : }
97 :
98 152044548 : trace_xfs_ag_resv_critical(pag, type, avail);
99 :
100 : /* Critically low if less than 10% or max btree height remains. */
101 152044461 : return XFS_TEST_ERROR(avail < orig / 10 ||
102 : avail < pag->pag_mount->m_agbtree_maxlevels,
103 : pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
104 : }
105 :
106 : /*
107 : * How many blocks are reserved but not used, and therefore must not be
108 : * allocated away?
109 : */
110 : xfs_extlen_t
111 480650324 : xfs_ag_resv_needed(
112 : struct xfs_perag *pag,
113 : enum xfs_ag_resv_type type)
114 : {
115 480650324 : xfs_extlen_t len;
116 :
117 480650324 : len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
118 480650324 : switch (type) {
119 : case XFS_AG_RESV_METADATA:
120 : case XFS_AG_RESV_RMAPBT:
121 808694 : len -= xfs_perag_resv(pag, type)->ar_reserved;
122 808694 : break;
123 : case XFS_AG_RESV_IMETA:
124 : case XFS_AG_RESV_NONE:
125 : /*
126 : * In noalloc mode, we pretend that all the free blocks in this
127 : * AG have been allocated. Make this AG look full.
128 : */
129 959683260 : if (xfs_perag_prohibits_alloc(pag))
130 0 : len += xfs_ag_fdblocks(pag);
131 : break;
132 0 : default:
133 0 : ASSERT(0);
134 : }
135 :
136 480650324 : trace_xfs_ag_resv_needed(pag, type, len);
137 :
138 480625872 : return len;
139 : }
140 :
141 : /* Clean out a reservation */
142 : static void
143 2571454 : __xfs_ag_resv_free(
144 : struct xfs_perag *pag,
145 : enum xfs_ag_resv_type type)
146 : {
147 2571454 : struct xfs_ag_resv *resv;
148 2571454 : xfs_extlen_t oldresv;
149 :
150 2571454 : trace_xfs_ag_resv_free(pag, type, 0);
151 :
152 2571001 : resv = xfs_perag_resv(pag, type);
153 2571001 : if (pag->pag_agno == 0)
154 373054 : pag->pag_mount->m_ag_max_usable += resv->ar_asked;
155 : /*
156 : * RMAPBT blocks come from the AGFL and AGFL blocks are always
157 : * considered "free", so whatever was reserved at mount time must be
158 : * given back at umount.
159 : */
160 2571001 : if (type == XFS_AG_RESV_RMAPBT)
161 1284615 : oldresv = resv->ar_orig_reserved;
162 : else
163 1286386 : oldresv = resv->ar_reserved;
164 2571001 : xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
165 2572808 : resv->ar_reserved = 0;
166 2572808 : resv->ar_asked = 0;
167 2572808 : resv->ar_orig_reserved = 0;
168 2572808 : }
169 :
170 : /* Free a per-AG reservation. */
171 : void
172 1285187 : xfs_ag_resv_free(
173 : struct xfs_perag *pag)
174 : {
175 1285187 : __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
176 1286491 : __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
177 1286445 : }
178 :
179 : static int
180 2552452 : __xfs_ag_resv_init(
181 : struct xfs_perag *pag,
182 : enum xfs_ag_resv_type type,
183 : xfs_extlen_t ask,
184 : xfs_extlen_t used)
185 : {
186 2552452 : struct xfs_mount *mp = pag->pag_mount;
187 2552452 : struct xfs_ag_resv *resv;
188 2552452 : int error;
189 2552452 : xfs_extlen_t hidden_space;
190 :
191 2552452 : if (used > ask)
192 : ask = used;
193 :
194 2552452 : switch (type) {
195 : case XFS_AG_RESV_RMAPBT:
196 : /*
197 : * Space taken by the rmapbt is not subtracted from fdblocks
198 : * because the rmapbt lives in the free space. Here we must
199 : * subtract the entire reservation from fdblocks so that we
200 : * always have blocks available for rmapbt expansion.
201 : */
202 : hidden_space = ask;
203 : break;
204 1276304 : case XFS_AG_RESV_METADATA:
205 : /*
206 : * Space taken by all other metadata btrees are accounted
207 : * on-disk as used space. We therefore only hide the space
208 : * that is reserved but not used by the trees.
209 : */
210 1276304 : hidden_space = ask - used;
211 1276304 : break;
212 0 : default:
213 0 : ASSERT(0);
214 0 : return -EINVAL;
215 : }
216 :
217 2552452 : if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
218 : error = -ENOSPC;
219 : else
220 2551954 : error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
221 2553397 : if (error) {
222 0 : trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
223 0 : error, _RET_IP_);
224 0 : xfs_warn(mp,
225 : "Per-AG reservation for AG %u failed. Filesystem may run out of space.",
226 : pag->pag_agno);
227 0 : return error;
228 : }
229 :
230 : /*
231 : * Reduce the maximum per-AG allocation length by however much we're
232 : * trying to reserve for an AG. Since this is a filesystem-wide
233 : * counter, we only make the adjustment for AG 0. This assumes that
234 : * there aren't any AGs hungrier for per-AG reservation than AG 0.
235 : */
236 2553397 : if (pag->pag_agno == 0)
237 368474 : mp->m_ag_max_usable -= ask;
238 :
239 2553397 : resv = xfs_perag_resv(pag, type);
240 2553397 : resv->ar_asked = ask;
241 2553397 : resv->ar_orig_reserved = hidden_space;
242 2553397 : resv->ar_reserved = ask - used;
243 :
244 2553397 : trace_xfs_ag_resv_init(pag, type, ask);
245 2553397 : return 0;
246 : }
247 :
248 : /* Create a per-AG block reservation. */
249 : int
250 1279683 : xfs_ag_resv_init(
251 : struct xfs_perag *pag,
252 : struct xfs_trans *tp)
253 : {
254 1279683 : struct xfs_mount *mp = pag->pag_mount;
255 1279683 : xfs_extlen_t ask;
256 1279683 : xfs_extlen_t used;
257 1279683 : int error = 0, error2;
258 1279683 : bool has_resv = false;
259 :
260 : /* Create the metadata reservation. */
261 1279683 : if (pag->pag_meta_resv.ar_asked == 0) {
262 1277503 : ask = used = 0;
263 :
264 1277503 : error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
265 1276548 : if (error)
266 436 : goto out;
267 :
268 1276112 : error = xfs_finobt_calc_reserves(pag, tp, &ask, &used);
269 1276384 : if (error)
270 4 : goto out;
271 :
272 1276380 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
273 : ask, used);
274 1276097 : if (error) {
275 : /*
276 : * Because we didn't have per-AG reservations when the
277 : * finobt feature was added we might not be able to
278 : * reserve all needed blocks. Warn and fall back to the
279 : * old and potentially buggy code in that case, but
280 : * ensure we do have the reservation for the refcountbt.
281 : */
282 0 : ask = used = 0;
283 :
284 0 : mp->m_finobt_nores = true;
285 :
286 0 : error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
287 : &used);
288 0 : if (error)
289 0 : goto out;
290 :
291 0 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
292 : ask, used);
293 0 : if (error)
294 0 : goto out;
295 : }
296 1276097 : if (ask)
297 1260458 : has_resv = true;
298 : }
299 :
300 : /* Create the RMAPBT metadata reservation */
301 1278277 : if (pag->pag_rmapbt_resv.ar_asked == 0) {
302 1275902 : ask = used = 0;
303 :
304 1275902 : error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
305 1276612 : if (error)
306 0 : goto out;
307 :
308 1276612 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
309 1277076 : if (error)
310 0 : goto out;
311 1277076 : if (ask)
312 : has_resv = true;
313 : }
314 :
315 58134 : out:
316 : /*
317 : * Initialize the pagf if we have at least one active reservation on the
318 : * AG. This may have occurred already via reservation calculation, but
319 : * fall back to an explicit init to ensure the in-core allocbt usage
320 : * counters are initialized as soon as possible. This is important
321 : * because filesystems with large perag reservations are susceptible to
322 : * free space reservation problems that the allocbt counter is used to
323 : * address.
324 : */
325 58574 : if (has_resv) {
326 1261289 : error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
327 1261069 : if (error2)
328 : return error2;
329 :
330 : /*
331 : * If there isn't enough space in the AG to satisfy the
332 : * reservation, let the caller know that there wasn't enough
333 : * space. Callers are responsible for deciding what to do
334 : * next, since (in theory) we can stumble along with
335 : * insufficient reservation if data blocks are being freed to
336 : * replenish the AG's free space.
337 : */
338 1261065 : if (!error &&
339 1261123 : xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
340 1261123 : xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
341 1261123 : pag->pagf_freeblks + pag->pagf_flcount)
342 0 : error = -ENOSPC;
343 : }
344 :
345 : return error;
346 : }
347 :
348 : /* Allocate a block from the reservation. */
349 : void
350 98526897 : xfs_ag_resv_alloc_extent(
351 : struct xfs_perag *pag,
352 : enum xfs_ag_resv_type type,
353 : struct xfs_alloc_arg *args)
354 : {
355 98526897 : struct xfs_ag_resv *resv;
356 98526897 : xfs_extlen_t len;
357 98526897 : uint field;
358 :
359 195104164 : ASSERT(type != XFS_AG_RESV_NONE || !xfs_perag_prohibits_alloc(pag));
360 :
361 98526897 : trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
362 :
363 98503652 : switch (type) {
364 : case XFS_AG_RESV_AGFL:
365 : case XFS_AG_RESV_IMETA:
366 : return;
367 : case XFS_AG_RESV_METADATA:
368 : case XFS_AG_RESV_RMAPBT:
369 1208906 : resv = xfs_perag_resv(pag, type);
370 1208906 : break;
371 0 : default:
372 0 : ASSERT(0);
373 96568450 : fallthrough;
374 96568450 : case XFS_AG_RESV_NONE:
375 96568450 : field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
376 : XFS_TRANS_SB_FDBLOCKS;
377 96568450 : xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
378 96568450 : return;
379 : }
380 :
381 1208906 : len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
382 1208906 : resv->ar_reserved -= len;
383 1208906 : if (type == XFS_AG_RESV_RMAPBT)
384 : return;
385 : /* Allocations of reserved blocks only need on-disk sb updates... */
386 365643 : xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
387 : /* ...but non-reserved blocks need in-core and on-disk updates. */
388 365468 : if (args->len > len)
389 0 : xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
390 0 : -((int64_t)args->len - len));
391 : }
392 :
393 : /* Free a block to the reservation. */
394 : void
395 106094964 : xfs_ag_resv_free_extent(
396 : struct xfs_perag *pag,
397 : enum xfs_ag_resv_type type,
398 : struct xfs_trans *tp,
399 : xfs_extlen_t len)
400 : {
401 106094964 : xfs_extlen_t leftover;
402 106094964 : struct xfs_ag_resv *resv;
403 :
404 106094964 : trace_xfs_ag_resv_free_extent(pag, type, len);
405 :
406 106082857 : switch (type) {
407 : case XFS_AG_RESV_AGFL:
408 : case XFS_AG_RESV_IMETA:
409 : return;
410 : case XFS_AG_RESV_METADATA:
411 : case XFS_AG_RESV_RMAPBT:
412 509494 : resv = xfs_perag_resv(pag, type);
413 509494 : break;
414 0 : default:
415 0 : ASSERT(0);
416 105000272 : fallthrough;
417 105000272 : case XFS_AG_RESV_NONE:
418 : /*
419 : * Normally we put freed blocks back into fdblocks. In noalloc
420 : * mode, however, we pretend that there are no fdblocks in the
421 : * AG, so don't put them back.
422 : */
423 210000544 : if (!xfs_perag_prohibits_alloc(pag))
424 104999764 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS,
425 : (int64_t)len);
426 : fallthrough;
427 : case XFS_AG_RESV_IGNORE:
428 : return;
429 : }
430 :
431 509494 : leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
432 509494 : resv->ar_reserved += leftover;
433 509494 : if (type == XFS_AG_RESV_RMAPBT)
434 : return;
435 : /* Freeing into the reserved pool only requires on-disk update... */
436 203641 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
437 : /* ...but freeing beyond that requires in-core and on-disk update. */
438 203553 : if (len > leftover && !xfs_perag_prohibits_alloc(pag))
439 0 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
440 : }
|