Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0+
2 : /*
3 : * Copyright (C) 2016 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_alloc.h"
14 : #include "xfs_errortag.h"
15 : #include "xfs_error.h"
16 : #include "xfs_trace.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_rmap_btree.h"
19 : #include "xfs_btree.h"
20 : #include "xfs_refcount_btree.h"
21 : #include "xfs_ialloc_btree.h"
22 : #include "xfs_ag.h"
23 : #include "xfs_ag_resv.h"
24 :
25 : /*
26 : * Per-AG Block Reservations
27 : *
28 : * For some kinds of allocation group metadata structures, it is advantageous
29 : * to reserve a small number of blocks in each AG so that future expansions of
30 : * that data structure do not encounter ENOSPC because errors during a btree
31 : * split cause the filesystem to go offline.
32 : *
33 : * Prior to the introduction of reflink, this wasn't an issue because the free
34 : * space btrees maintain a reserve of space (the AGFL) to handle any expansion
35 : * that may be necessary; and allocations of other metadata (inodes, BMBT,
36 : * dir/attr) aren't restricted to a single AG. However, with reflink it is
37 : * possible to allocate all the space in an AG, have subsequent reflink/CoW
38 : * activity expand the refcount btree, and discover that there's no space left
39 : * to handle that expansion. Since we can calculate the maximum size of the
40 : * refcount btree, we can reserve space for it and avoid ENOSPC.
41 : *
42 : * Handling per-AG reservations consists of four changes to the allocator's
43 : * behavior: First, because these reservations are always needed, we decrease
44 : * the ag_max_usable counter to reflect the size of the AG after the reserved
45 : * blocks are taken. Second, the reservations must be reflected in the
46 : * fdblocks count to maintain proper accounting. Third, each AG must maintain
47 : * its own reserved block counter so that we can calculate the amount of space
48 : * that must remain free to maintain the reservations. Fourth, the "remaining
49 : * reserved blocks" count must be used when calculating the length of the
50 : * longest free extent in an AG and to clamp maxlen in the per-AG allocation
51 : * functions. In other words, we maintain a virtual allocation via in-core
52 : * accounting tricks so that we don't have to clean up after a crash. :)
53 : *
54 : * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
55 : * values via struct xfs_alloc_arg or directly to the xfs_free_extent
56 : * function. It might seem a little funny to maintain a reservoir of blocks
57 : * to feed another reservoir, but the AGFL only holds enough blocks to get
58 : * through the next transaction. The per-AG reservation is to ensure (we
59 : * hope) that each AG never runs out of blocks. Each data structure wanting
60 : * to use the reservation system should update ask/used in xfs_ag_resv_init.
61 : */
62 :
63 : /*
64 : * Are we critically low on blocks? For now we'll define that as the number
65 : * of blocks we can get our hands on being less than 10% of what we reserved
66 : * or less than some arbitrary number (maximum btree height).
67 : */
68 : bool
69 108557332 : xfs_ag_resv_critical(
70 : struct xfs_perag *pag,
71 : enum xfs_ag_resv_type type)
72 : {
73 108557332 : xfs_extlen_t avail;
74 108557332 : xfs_extlen_t orig;
75 :
76 : /*
77 : * Pretend we're critically low on reservations in this AG to scare
78 : * everyone else away.
79 : */
80 217114664 : if (xfs_perag_prohibits_alloc(pag))
81 : return true;
82 :
83 108557332 : switch (type) {
84 54278661 : case XFS_AG_RESV_METADATA:
85 54278661 : avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
86 54278661 : orig = pag->pag_meta_resv.ar_asked;
87 54278661 : break;
88 54278671 : case XFS_AG_RESV_RMAPBT:
89 54278671 : avail = pag->pagf_freeblks + pag->pagf_flcount -
90 54278671 : pag->pag_meta_resv.ar_reserved;
91 54278671 : orig = pag->pag_rmapbt_resv.ar_asked;
92 54278671 : break;
93 0 : default:
94 0 : ASSERT(0);
95 0 : return false;
96 : }
97 :
98 108557332 : trace_xfs_ag_resv_critical(pag, type, avail);
99 :
100 : /* Critically low if less than 10% or max btree height remains. */
101 108557328 : return XFS_TEST_ERROR(avail < orig / 10 ||
102 : avail < pag->pag_mount->m_agbtree_maxlevels,
103 : pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
104 : }
105 :
106 : /*
107 : * How many blocks are reserved but not used, and therefore must not be
108 : * allocated away?
109 : */
110 : xfs_extlen_t
111 226184085 : xfs_ag_resv_needed(
112 : struct xfs_perag *pag,
113 : enum xfs_ag_resv_type type)
114 : {
115 226184085 : xfs_extlen_t len;
116 :
117 226184085 : len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
118 226184085 : switch (type) {
119 : case XFS_AG_RESV_METADATA:
120 : case XFS_AG_RESV_RMAPBT:
121 410611 : len -= xfs_perag_resv(pag, type)->ar_reserved;
122 410611 : break;
123 : case XFS_AG_RESV_IMETA:
124 : case XFS_AG_RESV_NONE:
125 : /*
126 : * In noalloc mode, we pretend that all the free blocks in this
127 : * AG have been allocated. Make this AG look full.
128 : */
129 451546948 : if (xfs_perag_prohibits_alloc(pag))
130 0 : len += xfs_ag_fdblocks(pag);
131 : break;
132 0 : default:
133 0 : ASSERT(0);
134 : }
135 :
136 226184085 : trace_xfs_ag_resv_needed(pag, type, len);
137 :
138 226187378 : return len;
139 : }
140 :
141 : /* Clean out a reservation */
142 : static void
143 756724 : __xfs_ag_resv_free(
144 : struct xfs_perag *pag,
145 : enum xfs_ag_resv_type type)
146 : {
147 756724 : struct xfs_ag_resv *resv;
148 756724 : xfs_extlen_t oldresv;
149 :
150 756724 : trace_xfs_ag_resv_free(pag, type, 0);
151 :
152 756720 : resv = xfs_perag_resv(pag, type);
153 756720 : if (pag->pag_agno == 0)
154 156068 : pag->pag_mount->m_ag_max_usable += resv->ar_asked;
155 : /*
156 : * RMAPBT blocks come from the AGFL and AGFL blocks are always
157 : * considered "free", so whatever was reserved at mount time must be
158 : * given back at umount.
159 : */
160 756720 : if (type == XFS_AG_RESV_RMAPBT)
161 378345 : oldresv = resv->ar_orig_reserved;
162 : else
163 378375 : oldresv = resv->ar_reserved;
164 756720 : xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
165 756750 : resv->ar_reserved = 0;
166 756750 : resv->ar_asked = 0;
167 756750 : resv->ar_orig_reserved = 0;
168 756750 : }
169 :
170 : /* Free a per-AG reservation. */
171 : void
172 378346 : xfs_ag_resv_free(
173 : struct xfs_perag *pag)
174 : {
175 378346 : __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
176 378375 : __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
177 378375 : }
178 :
179 : static int
180 738759 : __xfs_ag_resv_init(
181 : struct xfs_perag *pag,
182 : enum xfs_ag_resv_type type,
183 : xfs_extlen_t ask,
184 : xfs_extlen_t used)
185 : {
186 738759 : struct xfs_mount *mp = pag->pag_mount;
187 738759 : struct xfs_ag_resv *resv;
188 738759 : int error;
189 738759 : xfs_extlen_t hidden_space;
190 :
191 738759 : if (used > ask)
192 : ask = used;
193 :
194 738759 : switch (type) {
195 : case XFS_AG_RESV_RMAPBT:
196 : /*
197 : * Space taken by the rmapbt is not subtracted from fdblocks
198 : * because the rmapbt lives in the free space. Here we must
199 : * subtract the entire reservation from fdblocks so that we
200 : * always have blocks available for rmapbt expansion.
201 : */
202 : hidden_space = ask;
203 : break;
204 369371 : case XFS_AG_RESV_METADATA:
205 : /*
206 : * Space taken by all other metadata btrees are accounted
207 : * on-disk as used space. We therefore only hide the space
208 : * that is reserved but not used by the trees.
209 : */
210 369371 : hidden_space = ask - used;
211 369371 : break;
212 0 : default:
213 0 : ASSERT(0);
214 0 : return -EINVAL;
215 : }
216 :
217 738759 : if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
218 : error = -ENOSPC;
219 : else
220 738756 : error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
221 738785 : if (error) {
222 0 : trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
223 0 : error, _RET_IP_);
224 0 : xfs_warn(mp,
225 : "Per-AG reservation for AG %u failed. Filesystem may run out of space.",
226 : pag->pag_agno);
227 0 : return error;
228 : }
229 :
230 : /*
231 : * Reduce the maximum per-AG allocation length by however much we're
232 : * trying to reserve for an AG. Since this is a filesystem-wide
233 : * counter, we only make the adjustment for AG 0. This assumes that
234 : * there aren't any AGs hungrier for per-AG reservation than AG 0.
235 : */
236 738785 : if (pag->pag_agno == 0)
237 151542 : mp->m_ag_max_usable -= ask;
238 :
239 738785 : resv = xfs_perag_resv(pag, type);
240 738785 : resv->ar_asked = ask;
241 738785 : resv->ar_orig_reserved = hidden_space;
242 738785 : resv->ar_reserved = ask - used;
243 :
244 738785 : trace_xfs_ag_resv_init(pag, type, ask);
245 738785 : return 0;
246 : }
247 :
248 : /* Create a per-AG block reservation. */
249 : int
250 370021 : xfs_ag_resv_init(
251 : struct xfs_perag *pag,
252 : struct xfs_trans *tp)
253 : {
254 370021 : struct xfs_mount *mp = pag->pag_mount;
255 370021 : xfs_extlen_t ask;
256 370021 : xfs_extlen_t used;
257 370021 : int error = 0, error2;
258 370021 : bool has_resv = false;
259 :
260 : /* Create the metadata reservation. */
261 370021 : if (pag->pag_meta_resv.ar_asked == 0) {
262 369463 : ask = used = 0;
263 :
264 369463 : error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
265 369460 : if (error)
266 78 : goto out;
267 :
268 369382 : error = xfs_finobt_calc_reserves(pag, tp, &ask, &used);
269 369365 : if (error)
270 0 : goto out;
271 :
272 369365 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
273 : ask, used);
274 369391 : if (error) {
275 : /*
276 : * Because we didn't have per-AG reservations when the
277 : * finobt feature was added we might not be able to
278 : * reserve all needed blocks. Warn and fall back to the
279 : * old and potentially buggy code in that case, but
280 : * ensure we do have the reservation for the refcountbt.
281 : */
282 0 : ask = used = 0;
283 :
284 0 : mp->m_finobt_nores = true;
285 :
286 0 : error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
287 : &used);
288 0 : if (error)
289 0 : goto out;
290 :
291 0 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
292 : ask, used);
293 0 : if (error)
294 0 : goto out;
295 : }
296 369391 : if (ask)
297 366751 : has_resv = true;
298 : }
299 :
300 : /* Create the RMAPBT metadata reservation */
301 369949 : if (pag->pag_rmapbt_resv.ar_asked == 0) {
302 369389 : ask = used = 0;
303 :
304 369389 : error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
305 369386 : if (error)
306 0 : goto out;
307 :
308 369386 : error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
309 369393 : if (error)
310 0 : goto out;
311 369393 : if (ask)
312 : has_resv = true;
313 : }
314 :
315 3424 : out:
316 : /*
317 : * Initialize the pagf if we have at least one active reservation on the
318 : * AG. This may have occurred already via reservation calculation, but
319 : * fall back to an explicit init to ensure the in-core allocbt usage
320 : * counters are initialized as soon as possible. This is important
321 : * because filesystems with large perag reservations are susceptible to
322 : * free space reservation problems that the allocbt counter is used to
323 : * address.
324 : */
325 3502 : if (has_resv) {
326 366753 : error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
327 366747 : if (error2)
328 : return error2;
329 :
330 : /*
331 : * If there isn't enough space in the AG to satisfy the
332 : * reservation, let the caller know that there wasn't enough
333 : * space. Callers are responsible for deciding what to do
334 : * next, since (in theory) we can stumble along with
335 : * insufficient reservation if data blocks are being freed to
336 : * replenish the AG's free space.
337 : */
338 366747 : if (!error &&
339 366747 : xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
340 366747 : xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
341 366747 : pag->pagf_freeblks + pag->pagf_flcount)
342 0 : error = -ENOSPC;
343 : }
344 :
345 : return error;
346 : }
347 :
348 : /* Allocate a block from the reservation. */
349 : void
350 64147832 : xfs_ag_resv_alloc_extent(
351 : struct xfs_perag *pag,
352 : enum xfs_ag_resv_type type,
353 : struct xfs_alloc_arg *args)
354 : {
355 64147832 : struct xfs_ag_resv *resv;
356 64147832 : xfs_extlen_t len;
357 64147832 : uint field;
358 :
359 127529411 : ASSERT(type != XFS_AG_RESV_NONE || !xfs_perag_prohibits_alloc(pag));
360 :
361 64147832 : trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
362 :
363 64148894 : switch (type) {
364 : case XFS_AG_RESV_AGFL:
365 : case XFS_AG_RESV_IMETA:
366 : return;
367 : case XFS_AG_RESV_METADATA:
368 : case XFS_AG_RESV_RMAPBT:
369 604352 : resv = xfs_perag_resv(pag, type);
370 604352 : break;
371 0 : default:
372 0 : ASSERT(0);
373 63381749 : fallthrough;
374 63381749 : case XFS_AG_RESV_NONE:
375 63381749 : field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
376 : XFS_TRANS_SB_FDBLOCKS;
377 63381749 : xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
378 63381749 : return;
379 : }
380 :
381 604352 : len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
382 604352 : resv->ar_reserved -= len;
383 604352 : if (type == XFS_AG_RESV_RMAPBT)
384 : return;
385 : /* Allocations of reserved blocks only need on-disk sb updates... */
386 194462 : xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
387 : /* ...but non-reserved blocks need in-core and on-disk updates. */
388 194453 : if (args->len > len)
389 0 : xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
390 0 : -((int64_t)args->len - len));
391 : }
392 :
393 : /* Free a block to the reservation. */
394 : void
395 53178320 : xfs_ag_resv_free_extent(
396 : struct xfs_perag *pag,
397 : enum xfs_ag_resv_type type,
398 : struct xfs_trans *tp,
399 : xfs_extlen_t len)
400 : {
401 53178320 : xfs_extlen_t leftover;
402 53178320 : struct xfs_ag_resv *resv;
403 :
404 53178320 : trace_xfs_ag_resv_free_extent(pag, type, len);
405 :
406 53178409 : switch (type) {
407 : case XFS_AG_RESV_AGFL:
408 : case XFS_AG_RESV_IMETA:
409 : return;
410 : case XFS_AG_RESV_METADATA:
411 : case XFS_AG_RESV_RMAPBT:
412 303356 : resv = xfs_perag_resv(pag, type);
413 303356 : break;
414 0 : default:
415 0 : ASSERT(0);
416 52576913 : fallthrough;
417 52576913 : case XFS_AG_RESV_NONE:
418 : /*
419 : * Normally we put freed blocks back into fdblocks. In noalloc
420 : * mode, however, we pretend that there are no fdblocks in the
421 : * AG, so don't put them back.
422 : */
423 105153826 : if (!xfs_perag_prohibits_alloc(pag))
424 52576876 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS,
425 : (int64_t)len);
426 : fallthrough;
427 : case XFS_AG_RESV_IGNORE:
428 : return;
429 : }
430 :
431 303356 : leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
432 303356 : resv->ar_reserved += leftover;
433 303356 : if (type == XFS_AG_RESV_RMAPBT)
434 : return;
435 : /* Freeing into the reserved pool only requires on-disk update... */
436 130129 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
437 : /* ...but freeing beyond that requires in-core and on-disk update. */
438 130127 : if (len > leftover && !xfs_perag_prohibits_alloc(pag))
439 0 : xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
440 : }
|