Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 :
3 : #include <linux/jiffies.h>
4 : #include <linux/kernel.h>
5 : #include <linux/ktime.h>
6 : #include <linux/list.h>
7 : #include <linux/math64.h>
8 : #include <linux/sizes.h>
9 : #include <linux/workqueue.h>
10 : #include "ctree.h"
11 : #include "block-group.h"
12 : #include "discard.h"
13 : #include "free-space-cache.h"
14 : #include "fs.h"
15 :
16 : /*
17 : * This contains the logic to handle async discard.
18 : *
19 : * Async discard manages trimming of free space outside of transaction commit.
20 : * Discarding is done by managing the block_groups on an LRU list based on free
21 : * space recency. Two passes are used: the first prioritizes discarding extents,
22 : * the second gives bitmap trimming the best opportunity to coalesce.
23 : * The block_groups are maintained on multiple lists to allow for multiple
24 : * passes with different discard filter requirements. A delayed work item is
25 : * used to manage discarding with timeout determined by a max of the delay
26 : * incurred by the iops rate limit, the byte rate limit, and the max delay of
27 : * BTRFS_DISCARD_MAX_DELAY_MSEC.
28 : *
29 : * Note, this only keeps track of block_groups that are explicitly for data.
30 : * Mixed block_groups are not supported.
31 : *
32 : * The first list is special to manage discarding of fully free block groups.
33 : * This is necessary because we issue a final trim for a fully free block group
34 : * after forgetting it. When a block group becomes unused, instead of directly
35 : * being added to the unused_bgs list, we add it to this first list. Then
36 : * from there, if it becomes fully discarded, we place it onto the unused_bgs
37 : * list.
38 : *
39 : * The in-memory free space cache serves as the backing state for discard.
40 : * Consequently, there is no persistence. We opt to load all the block groups
41 : * in as not discarded, so a clean mount degenerates to the same state as
42 : * mounting after a crash.
43 : *
44 : * As the free space cache uses bitmaps, there exists a tradeoff between
45 : * ease/efficiency for find_free_extent() and the accuracy of discard state.
46 : * Here we opt to let untrimmed regions merge with everything while only letting
47 : * trimmed regions merge with other trimmed regions. This can cause
48 : * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
49 : * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
50 : * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
51 : * this resets the state and we will retry trimming the whole bitmap. This is a
52 : * tradeoff between discard state accuracy and the cost of accounting.
53 : */
54 :
55 : /* This is an initial delay to give some chance for block reuse */
56 : #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
57 : #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
58 :
59 : #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
60 : #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
61 : #define BTRFS_DISCARD_MAX_IOPS (1000U)
62 :
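The header comment above describes the work item timeout as the max of the iops-derived delay, the byte rate delay, and the block group's eligibility timeout. A minimal sketch of that combination follows; it is not part of discard.c, the helper name and parameters are hypothetical, and it ignores the override/elapsed-time adjustment applied when rescheduling:

/*
 * Illustrative sketch only (not part of discard.c): combine the iops delay,
 * the byte rate delay and the per-block-group eligibility timeout into a
 * single work item delay, in ns.
 */
static u64 example_discard_delay_ns(u32 iops_limit, u32 kbps_limit,
                                    u64 prev_discard_bytes, u64 bg_timeout_ns)
{
        u64 delay_ms = 0;
        u64 delay_ns;

        /* Base delay derived from the iops limit, clamped like delay_ms is. */
        if (iops_limit)
                delay_ms = clamp_t(u64, MSEC_PER_SEC / iops_limit,
                                   BTRFS_DISCARD_MIN_DELAY_MSEC,
                                   BTRFS_DISCARD_MAX_DELAY_MSEC);
        delay_ns = delay_ms * NSEC_PER_MSEC;

        /* Byte rate limit: time the previous discard would take at kbps_limit. */
        if (kbps_limit && prev_discard_bytes)
                delay_ns = max(delay_ns,
                               div64_u64(prev_discard_bytes * NSEC_PER_SEC,
                                         (u64)kbps_limit * SZ_1K));

        /* Never run before the block group becomes eligible. */
        return max(delay_ns, bg_timeout_ns);
}
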
63 : /* Monotonically decreasing minimum length filters after index 0 */
64 : static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
65 : 0,
66 : BTRFS_ASYNC_DISCARD_MAX_FILTER,
67 : BTRFS_ASYNC_DISCARD_MIN_FILTER
68 : };
69 :
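As a rough usage sketch of these filters (not part of discard.c; the helper name is hypothetical), this is how a freed region size would map to a discard index, mirroring the loop in btrfs_discard_check_filter() below:

/*
 * Illustrative only: return the first (largest-filter) list index whose
 * minimum length the freed region satisfies.
 */
static int example_discard_index_for_size(u64 bytes)
{
        int i;

        for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; i++) {
                if (bytes >= discard_minlen[i])
                        return i;
        }

        /*
         * Smaller than every filter: the real code simply does not move the
         * block group; returning the last index here is only for illustration.
         */
        return BTRFS_NR_DISCARD_LISTS - 1;
}
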
70 : static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
71 : struct btrfs_block_group *block_group)
72 : {
73 155313 : return &discard_ctl->discard_list[block_group->discard_index];
74 : }
75 :
76 : /*
77 : * Determine if async discard should be running.
78 : *
79 : * @discard_ctl: discard control
80 : *
81 : * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
82 : */
83 1761946 : static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
84 : {
85 1761946 : struct btrfs_fs_info *fs_info = container_of(discard_ctl,
86 : struct btrfs_fs_info,
87 : discard_ctl);
88 :
89 3523880 : return (!(fs_info->sb->s_flags & SB_RDONLY) &&
90 1761934 : test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
91 : }
92 :
93 157061 : static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
94 : struct btrfs_block_group *block_group)
95 : {
96 157061 : lockdep_assert_held(&discard_ctl->lock);
97 157061 : if (!btrfs_run_discard_work(discard_ctl))
98 : return;
99 :
100 155313 : if (list_empty(&block_group->discard_list) ||
101 153945 : block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
102 1517 : if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
103 1333 : block_group->discard_index = BTRFS_DISCARD_INDEX_START;
104 1517 : block_group->discard_eligible_time = (ktime_get_ns() +
105 : BTRFS_DISCARD_DELAY);
106 1517 : block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
107 : }
108 155313 : if (list_empty(&block_group->discard_list))
109 1368 : btrfs_get_block_group(block_group);
110 :
111 155313 : list_move_tail(&block_group->discard_list,
112 : get_discard_list(discard_ctl, block_group));
113 : }
114 :
115 3666883 : static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
116 : struct btrfs_block_group *block_group)
117 : {
118 3666883 : if (!btrfs_is_block_group_data_only(block_group))
119 : return;
120 :
121 157060 : spin_lock(&discard_ctl->lock);
122 157060 : __add_to_discard_list(discard_ctl, block_group);
123 157060 : spin_unlock(&discard_ctl->lock);
124 : }
125 :
126 44520 : static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
127 : struct btrfs_block_group *block_group)
128 : {
129 44520 : bool queued;
130 :
131 44520 : spin_lock(&discard_ctl->lock);
132 :
133 44520 : queued = !list_empty(&block_group->discard_list);
134 :
135 44520 : if (!btrfs_run_discard_work(discard_ctl)) {
136 9278 : spin_unlock(&discard_ctl->lock);
137 9278 : return;
138 : }
139 :
140 35242 : list_del_init(&block_group->discard_list);
141 :
142 35242 : block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
143 35242 : block_group->discard_eligible_time = (ktime_get_ns() +
144 : BTRFS_DISCARD_UNUSED_DELAY);
145 35242 : block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
146 35242 : if (!queued)
147 803 : btrfs_get_block_group(block_group);
148 35242 : list_add_tail(&block_group->discard_list,
149 : &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
150 :
151 35242 : spin_unlock(&discard_ctl->lock);
152 : }
153 :
154 935 : static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
155 : struct btrfs_block_group *block_group)
156 : {
157 935 : bool running = false;
158 935 : bool queued = false;
159 :
160 935 : spin_lock(&discard_ctl->lock);
161 :
162 935 : if (block_group == discard_ctl->block_group) {
163 267 : running = true;
164 267 : discard_ctl->block_group = NULL;
165 : }
166 :
167 935 : block_group->discard_eligible_time = 0;
168 935 : queued = !list_empty(&block_group->discard_list);
169 935 : list_del_init(&block_group->discard_list);
170 : /*
171 : * If the block group is currently running in the discard workfn, we
172 : * don't want to deref it, since it's still being used by the workfn.
173 : * The workfn will notice this case and deref the block group when it is
174 : * finished.
175 : */
176 935 : if (queued && !running)
177 571 : btrfs_put_block_group(block_group);
178 :
179 935 : spin_unlock(&discard_ctl->lock);
180 :
181 935 : return running;
182 : }
183 :
184 : /*
185 : * Find block_group that's up next for discarding.
186 : *
187 : * @discard_ctl: discard control
188 : * @now: current time
189 : *
190 : * Iterate over the discard lists to find the next block_group up for
191 : * discarding, checking the discard_eligible_time of each block_group.
192 : */
193 1536087 : static struct btrfs_block_group *find_next_block_group(
194 : struct btrfs_discard_ctl *discard_ctl,
195 : u64 now)
196 : {
197 1536087 : struct btrfs_block_group *ret_block_group = NULL, *block_group;
198 1536087 : int i;
199 :
200 6136561 : for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
201 4603357 : struct list_head *discard_list = &discard_ctl->discard_list[i];
202 :
203 4603357 : if (!list_empty(discard_list)) {
204 235325 : block_group = list_first_entry(discard_list,
205 : struct btrfs_block_group,
206 : discard_list);
207 :
208 235325 : if (!ret_block_group)
209 158365 : ret_block_group = block_group;
210 :
211 235325 : if (ret_block_group->discard_eligible_time < now)
212 : break;
213 :
214 232442 : if (ret_block_group->discard_eligible_time >
215 232442 : block_group->discard_eligible_time)
216 23648 : ret_block_group = block_group;
217 : }
218 : }
219 :
220 1536087 : return ret_block_group;
221 : }
222 :
223 : /*
224 : * Look up next block group and set it for use.
225 : *
226 : * @discard_ctl: discard control
227 : * @discard_state: the discard_state of the block_group after state management
228 : * @discard_index: the discard_index of the block_group after state management
229 : * @now: time when discard was invoked, in ns
230 : *
231 : * Wrap find_next_block_group() and set the block_group to be in use.
232 : * @discard_state's control flow is managed here. Variables related to
233 : * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
234 : * and @discard_index are remembered as they may change while we're discarding,
235 : * but we want the discard to execute in the context determined here.
236 : */
237 46429 : static struct btrfs_block_group *peek_discard_list(
238 : struct btrfs_discard_ctl *discard_ctl,
239 : enum btrfs_discard_state *discard_state,
240 : int *discard_index, u64 now)
241 : {
242 46429 : struct btrfs_block_group *block_group;
243 :
244 46429 : spin_lock(&discard_ctl->lock);
245 46491 : again:
246 46491 : block_group = find_next_block_group(discard_ctl, now);
247 :
248 46491 : if (block_group && now >= block_group->discard_eligible_time) {
249 4486 : if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
250 1086 : block_group->used != 0) {
251 62 : if (btrfs_is_block_group_data_only(block_group)) {
252 1 : __add_to_discard_list(discard_ctl, block_group);
253 : } else {
254 61 : list_del_init(&block_group->discard_list);
255 61 : btrfs_put_block_group(block_group);
256 : }
257 62 : goto again;
258 : }
259 4424 : if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
260 269 : block_group->discard_cursor = block_group->start;
261 269 : block_group->discard_state = BTRFS_DISCARD_EXTENTS;
262 : }
263 4424 : discard_ctl->block_group = block_group;
264 : }
265 46429 : if (block_group) {
266 46315 : *discard_state = block_group->discard_state;
267 46315 : *discard_index = block_group->discard_index;
268 : }
269 46429 : spin_unlock(&discard_ctl->lock);
270 :
271 46429 : return block_group;
272 : }
273 :
274 : /*
275 : * Update a block group's filters.
276 : *
277 : * @block_group: block group of interest
278 : * @bytes: recently freed region size after coalescing
279 : *
280 : * Async discard maintains multiple lists with progressively smaller filters
281 : * to prioritize discarding based on size. Should a free space region that
282 : * matches a larger filter be returned to the free_space_cache, prioritize that
283 : * discard by moving @block_group to the proper filter list.
284 : */
285 3704935 : void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
286 : u64 bytes)
287 : {
288 3704935 : struct btrfs_discard_ctl *discard_ctl;
289 :
290 3704935 : if (!block_group ||
291 3704935 : !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
292 : return;
293 :
294 3695537 : discard_ctl = &block_group->fs_info->discard_ctl;
295 :
296 3695537 : if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
297 24678 : bytes >= discard_minlen[block_group->discard_index - 1]) {
298 38 : int i;
299 :
300 38 : remove_from_discard_list(discard_ctl, block_group);
301 :
302 76 : for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
303 0 : i++) {
304 38 : if (bytes >= discard_minlen[i]) {
305 38 : block_group->discard_index = i;
306 38 : add_to_discard_list(discard_ctl, block_group);
307 38 : break;
308 : }
309 : }
310 : }
311 : }
312 :
313 : /*
314 : * Move a block group along the discard lists.
315 : *
316 : * @discard_ctl: discard control
317 : * @block_group: block_group of interest
318 : *
319 : * Increment @block_group's discard_index. If it falls off the list, let it be.
320 : * Otherwise add it back to the appropriate list.
321 : */
322 133 : static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
323 : struct btrfs_block_group *block_group)
324 : {
325 133 : block_group->discard_index++;
326 133 : if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
327 33 : block_group->discard_index = 1;
328 33 : return;
329 : }
330 :
331 100 : add_to_discard_list(discard_ctl, block_group);
332 : }
333 :
334 : /*
335 : * Remove a block_group from the discard lists.
336 : *
337 : * @discard_ctl: discard control
338 : * @block_group: block_group of interest
339 : *
340 : * Remove @block_group from the discard lists. If necessary, wait on the
341 : * current work and then reschedule the delayed work.
342 : */
343 630 : void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
344 : struct btrfs_block_group *block_group)
345 : {
346 630 : if (remove_from_discard_list(discard_ctl, block_group)) {
347 0 : cancel_delayed_work_sync(&discard_ctl->work);
348 0 : btrfs_discard_schedule_work(discard_ctl, true);
349 : }
350 630 : }
351 :
352 : /*
353 : * Handle queuing the block_groups.
354 : *
355 : * @discard_ctl: discard control
356 : * @block_group: block_group of interest
357 : *
358 : * Maintain the LRU order of the discard lists.
359 : */
360 3720663 : void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
361 : struct btrfs_block_group *block_group)
362 : {
363 3720663 : if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
364 : return;
365 :
366 3711265 : if (block_group->used == 0)
367 44520 : add_to_discard_unused_list(discard_ctl, block_group);
368 : else
369 3666745 : add_to_discard_list(discard_ctl, block_group);
370 :
371 3711265 : if (!delayed_work_pending(&discard_ctl->work))
372 1264509 : btrfs_discard_schedule_work(discard_ctl, false);
373 : }
374 :
375 1514050 : static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
376 : u64 now, bool override)
377 : {
378 1514050 : struct btrfs_block_group *block_group;
379 :
380 1514050 : if (!btrfs_run_discard_work(discard_ctl))
381 : return;
382 2779527 : if (!override && delayed_work_pending(&discard_ctl->work))
383 : return;
384 :
385 1489596 : block_group = find_next_block_group(discard_ctl, now);
386 1489596 : if (block_group) {
387 111988 : u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
388 111988 : u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
389 :
390 : /*
391 : * A single delayed workqueue item is responsible for
392 : * discarding, so we can manage the bytes rate limit by keeping
393 : * track of the previous discard.
394 : */
395 111988 : if (kbps_limit && discard_ctl->prev_discard) {
396 0 : u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
397 0 : u64 bps_delay = div64_u64(discard_ctl->prev_discard *
398 : NSEC_PER_SEC, bps_limit);
399 :
400 0 : delay = max(delay, bps_delay);
401 : }
402 :
403 : /*
404 : * This timeout is to hopefully prevent immediate discarding
405 : * in a recently allocated block group.
406 : */
407 111988 : if (now < block_group->discard_eligible_time) {
408 107663 : u64 bg_timeout = block_group->discard_eligible_time - now;
409 :
410 107663 : delay = max(delay, bg_timeout);
411 : }
412 :
413 111988 : if (override && discard_ctl->prev_discard) {
414 288 : u64 elapsed = now - discard_ctl->prev_discard_time;
415 :
416 288 : if (delay > elapsed)
417 221 : delay -= elapsed;
418 : else
419 : delay = 0;
420 : }
421 :
422 111988 : mod_delayed_work(discard_ctl->discard_workers,
423 : &discard_ctl->work, nsecs_to_jiffies(delay));
424 : }
425 : }
426 :
427 : /*
428 : * Responsible for scheduling the discard work.
429 : *
430 : * @discard_ctl: discard control
431 : * @override: override the current timer
432 : *
433 : * Discards are issued by a delayed workqueue item. @override is used to
434 : * update the current delay as the baseline delay interval is reevaluated on
435 : * transaction commit. This is also maxed with any other rate limit.
436 : */
437 1509627 : void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
438 : bool override)
439 : {
440 1509627 : const u64 now = ktime_get_ns();
441 :
442 1509627 : spin_lock(&discard_ctl->lock);
443 1509627 : __btrfs_discard_schedule_work(discard_ctl, now, override);
444 1509627 : spin_unlock(&discard_ctl->lock);
445 1509627 : }
446 :
447 : /*
448 : * Determine next step of a block_group.
449 : *
450 : * @discard_ctl: discard control
451 : * @block_group: block_group of interest
452 : *
453 : * Determine the next step for a block group after it's finished going through
454 : * a pass on a discard list. If it is unused and fully trimmed, we can mark it
455 : * unused and send it to the unused_bgs path. Otherwise, pass it onto the
456 : * appropriate filter list or let it fall off.
457 : */
458 267 : static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
459 : struct btrfs_block_group *block_group)
460 : {
461 267 : remove_from_discard_list(discard_ctl, block_group);
462 :
463 267 : if (block_group->used == 0) {
464 134 : if (btrfs_is_free_space_trimmed(block_group))
465 134 : btrfs_mark_bg_unused(block_group);
466 : else
467 0 : add_to_discard_unused_list(discard_ctl, block_group);
468 : } else {
469 133 : btrfs_update_discard_index(discard_ctl, block_group);
470 : }
471 267 : }
472 :
473 : /*
474 : * Discard work queue callback
475 : *
476 : * @work: work
477 : *
478 : * Find the next block_group to start discarding and then discard a single
479 : * region. It does this in a two-pass fashion: first extents and second
480 : * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
481 : */
482 46429 : static void btrfs_discard_workfn(struct work_struct *work)
483 : {
484 46429 : struct btrfs_discard_ctl *discard_ctl;
485 46429 : struct btrfs_block_group *block_group;
486 46429 : enum btrfs_discard_state discard_state;
487 46429 : int discard_index = 0;
488 46429 : u64 trimmed = 0;
489 46429 : u64 minlen = 0;
490 46429 : u64 now = ktime_get_ns();
491 :
492 46429 : discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
493 :
494 46429 : block_group = peek_discard_list(discard_ctl, &discard_state,
495 : &discard_index, now);
496 46429 : if (!block_group || !btrfs_run_discard_work(discard_ctl))
497 42006 : return;
498 46314 : if (now < block_group->discard_eligible_time) {
499 41891 : btrfs_discard_schedule_work(discard_ctl, false);
500 41891 : return;
501 : }
502 :
503 : /* Perform discarding */
504 4423 : minlen = discard_minlen[discard_index];
505 :
506 4423 : if (discard_state == BTRFS_DISCARD_BITMAPS) {
507 1512 : u64 maxlen = 0;
508 :
509 : /*
510 : * Use the previous level's minimum discard length as the max
511 : * length filter. In the case something is added to make a
512 : * region go beyond the max filter, the entire bitmap is set
513 : * back to BTRFS_TRIM_STATE_UNTRIMMED.
514 : */
515 1512 : if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
516 1378 : maxlen = discard_minlen[discard_index - 1];
517 :
518 1512 : btrfs_trim_block_group_bitmaps(block_group, &trimmed,
519 : block_group->discard_cursor,
520 : btrfs_block_group_end(block_group),
521 : minlen, maxlen, true);
522 1512 : discard_ctl->discard_bitmap_bytes += trimmed;
523 : } else {
524 2911 : btrfs_trim_block_group_extents(block_group, &trimmed,
525 : block_group->discard_cursor,
526 : btrfs_block_group_end(block_group),
527 : minlen, true);
528 2911 : discard_ctl->discard_extent_bytes += trimmed;
529 : }
530 :
531 : /* Determine next steps for a block_group */
532 4423 : if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
533 535 : if (discard_state == BTRFS_DISCARD_BITMAPS) {
534 267 : btrfs_finish_discard_pass(discard_ctl, block_group);
535 : } else {
536 268 : block_group->discard_cursor = block_group->start;
537 268 : spin_lock(&discard_ctl->lock);
538 268 : if (block_group->discard_state !=
539 : BTRFS_DISCARD_RESET_CURSOR)
540 268 : block_group->discard_state =
541 : BTRFS_DISCARD_BITMAPS;
542 268 : spin_unlock(&discard_ctl->lock);
543 : }
544 : }
545 :
546 4423 : now = ktime_get_ns();
547 4423 : spin_lock(&discard_ctl->lock);
548 4423 : discard_ctl->prev_discard = trimmed;
549 4423 : discard_ctl->prev_discard_time = now;
550 : /*
551 : * If the block group was removed from the discard list while it was
552 : * running in this workfn, then we didn't deref it, since this function
553 : * still owned that reference. But we set the discard_ctl->block_group
554 : * back to NULL, so we can use that condition to know that now we need
555 : * to deref the block_group.
556 : */
557 4423 : if (discard_ctl->block_group == NULL)
558 267 : btrfs_put_block_group(block_group);
559 4423 : discard_ctl->block_group = NULL;
560 4423 : __btrfs_discard_schedule_work(discard_ctl, now, false);
561 4423 : spin_unlock(&discard_ctl->lock);
562 : }
563 :
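A compact summary of the progression a block group follows as the workfn above (together with peek_discard_list()) repeatedly processes it; this only restates what the code does, in one place:

/*
 * Illustrative only:
 *
 *   BTRFS_DISCARD_RESET_CURSOR -> cursor reset to block_group->start,
 *                                 state becomes BTRFS_DISCARD_EXTENTS
 *   BTRFS_DISCARD_EXTENTS      -> trim free extents, one region per work run,
 *                                 advancing discard_cursor
 *   cursor reaches end         -> cursor reset, state becomes
 *                                 BTRFS_DISCARD_BITMAPS
 *   BTRFS_DISCARD_BITMAPS      -> trim bitmap regions, one region per work run
 *   cursor reaches end         -> btrfs_finish_discard_pass()
 */
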
564 : /*
565 : * Recalculate the base delay.
566 : *
567 : * @discard_ctl: discard control
568 : *
569 : * Recalculate the base delay: derive it from the configured iops_limit
570 : * (MSEC_PER_SEC / iops_limit) and clamp between BTRFS_DISCARD_MIN_DELAY_MSEC
571 : * and BTRFS_DISCARD_MAX_DELAY_MSEC; an unset iops_limit means no base delay.
572 : */
573 203227 : void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
574 : {
575 203227 : s32 discardable_extents;
576 203227 : s64 discardable_bytes;
577 203227 : u32 iops_limit;
578 203227 : unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
579 203227 : unsigned long delay;
580 :
581 203227 : discardable_extents = atomic_read(&discard_ctl->discardable_extents);
582 203227 : if (!discardable_extents)
583 : return;
584 :
585 31151 : spin_lock(&discard_ctl->lock);
586 :
587 : /*
588 : * The following is to fix a potential -1 discrepancy that we're not
589 : * sure how to reproduce. But given that this is the only place that
590 : * utilizes these numbers and this is only called from
591 : * btrfs_finish_extent_commit(), which is synchronized, we can correct it
592 : * here.
593 : */
594 31151 : if (discardable_extents < 0)
595 6 : atomic_add(-discardable_extents,
596 : &discard_ctl->discardable_extents);
597 :
598 31151 : discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
599 31151 : if (discardable_bytes < 0)
600 1 : atomic64_add(-discardable_bytes,
601 : &discard_ctl->discardable_bytes);
602 :
603 31151 : if (discardable_extents <= 0) {
604 6 : spin_unlock(&discard_ctl->lock);
605 6 : return;
606 : }
607 :
608 31145 : iops_limit = READ_ONCE(discard_ctl->iops_limit);
609 :
610 31145 : if (iops_limit) {
611 31145 : delay = MSEC_PER_SEC / iops_limit;
612 : } else {
613 : /*
614 : * Unset iops_limit means go as fast as possible, so allow a
615 : * delay of 0.
616 : */
617 : delay = 0;
618 : min_delay = 0;
619 : }
620 :
621 31145 : delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
622 31145 : discard_ctl->delay_ms = delay;
623 :
624 31145 : spin_unlock(&discard_ctl->lock);
625 : }
626 :
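A brief worked example of the calculation above (illustrative only, using the constants defined at the top of the file):

/*
 * With the default iops_limit of BTRFS_DISCARD_MAX_IOPS (1000),
 * delay = MSEC_PER_SEC / 1000 = 1 ms, which is already the clamp floor.
 * With iops_limit = 10, delay = 100 ms. With iops_limit = 0, delay_ms
 * stays 0 and pacing is left to kbps_limit (if set) and the per block
 * group eligibility times.
 */
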
627 : /*
628 : * Propagate discard counters.
629 : *
630 : * @block_group: block_group of interest
631 : *
632 : * Propagate deltas of counters up to the discard_ctl. It maintains a current
633 : * counter and a previous counter, passing the delta up to the global stat.
634 : * The current counter value then becomes the previous counter value.
635 : */
636 8065330 : void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
637 : {
638 8065330 : struct btrfs_free_space_ctl *ctl;
639 8065330 : struct btrfs_discard_ctl *discard_ctl;
640 8065330 : s32 extents_delta;
641 8065330 : s64 bytes_delta;
642 :
643 8065330 : if (!block_group ||
644 8065330 : !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
645 : !btrfs_is_block_group_data_only(block_group))
646 : return;
647 :
648 4041244 : ctl = block_group->free_space_ctl;
649 4041244 : discard_ctl = &block_group->fs_info->discard_ctl;
650 :
651 4041244 : lockdep_assert_held(&ctl->tree_lock);
652 4041244 : extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
653 4041244 : ctl->discardable_extents[BTRFS_STAT_PREV];
654 4041244 : if (extents_delta) {
655 226090 : atomic_add(extents_delta, &discard_ctl->discardable_extents);
656 226091 : ctl->discardable_extents[BTRFS_STAT_PREV] =
657 226091 : ctl->discardable_extents[BTRFS_STAT_CURR];
658 : }
659 :
660 4041245 : bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
661 4041245 : ctl->discardable_bytes[BTRFS_STAT_PREV];
662 4041245 : if (bytes_delta) {
663 1900423 : atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
664 1900424 : ctl->discardable_bytes[BTRFS_STAT_PREV] =
665 1900424 : ctl->discardable_bytes[BTRFS_STAT_CURR];
666 : }
667 : }
668 :
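A minimal, generic sketch of the CURR/PREV delta pattern used above (not part of discard.c; the helper name is hypothetical):

/*
 * Illustrative only: push the change since the last update into a global
 * counter, then remember the current value as the new baseline.
 */
static void example_propagate_delta(atomic64_t *global, s64 *prev, s64 curr)
{
        s64 delta = curr - *prev;

        if (delta) {
                atomic64_add(delta, global);
                *prev = curr;
        }
}
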
669 : /*
670 : * Punt unused_bgs list to discard lists.
671 : *
672 : * @fs_info: fs_info of interest
673 : *
674 : * The unused_bgs list needs to be punted to the discard lists because the
675 : * order of operations is changed. In the normal synchronous discard path, the
676 : * block groups are trimmed via a single large trim in transaction commit. This
677 : * is ultimately what we are trying to avoid with asynchronous discard. Thus,
678 : * it must be done before going down the unused_bgs path.
679 : */
680 3161 : void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
681 : {
682 3161 : struct btrfs_block_group *block_group, *next;
683 :
684 3161 : spin_lock(&fs_info->unused_bgs_lock);
685 : /* We enabled async discard, so punt all to the queue */
686 12434 : list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
687 : bg_list) {
688 9273 : list_del_init(&block_group->bg_list);
689 9273 : btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
690 : /*
691 : * This put is for the get done by btrfs_mark_bg_unused.
692 : * Queueing discard incremented it for discard's reference.
693 : */
694 9273 : btrfs_put_block_group(block_group);
695 : }
696 3161 : spin_unlock(&fs_info->unused_bgs_lock);
697 3161 : }
698 :
699 : /*
700 : * Purge discard lists.
701 : *
702 : * @discard_ctl: discard control
703 : *
704 : * If we are disabling async discard, we may have intercepted block groups that
705 : * are completely free and ready for the unused_bgs path. As discarding will
706 : * now happen in transaction commit or not at all, we can safely mark the
707 : * corresponding block groups as unused and they will be sent on their merry
708 : * way to the unused_bgs list.
709 : */
710 3234 : static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
711 : {
712 3234 : struct btrfs_block_group *block_group, *next;
713 3234 : int i;
714 :
715 3234 : spin_lock(&discard_ctl->lock);
716 16170 : for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
717 10974 : list_for_each_entry_safe(block_group, next,
718 : &discard_ctl->discard_list[i],
719 : discard_list) {
720 1272 : list_del_init(&block_group->discard_list);
721 1272 : spin_unlock(&discard_ctl->lock);
722 1272 : if (block_group->used == 0)
723 249 : btrfs_mark_bg_unused(block_group);
724 1272 : spin_lock(&discard_ctl->lock);
725 1272 : btrfs_put_block_group(block_group);
726 : }
727 : }
728 3234 : spin_unlock(&discard_ctl->lock);
729 3234 : }
730 :
731 3177 : void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
732 : {
733 3177 : if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
734 16 : btrfs_discard_cleanup(fs_info);
735 16 : return;
736 : }
737 :
738 3161 : btrfs_discard_punt_unused_bgs_list(fs_info);
739 :
740 3161 : set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
741 : }
742 :
743 17 : void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
744 : {
745 17 : clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
746 17 : }
747 :
748 3467 : void btrfs_discard_init(struct btrfs_fs_info *fs_info)
749 : {
750 3467 : struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
751 3467 : int i;
752 :
753 3467 : spin_lock_init(&discard_ctl->lock);
754 3467 : INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
755 :
756 17335 : for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
757 10401 : INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
758 :
759 3467 : discard_ctl->prev_discard = 0;
760 3467 : discard_ctl->prev_discard_time = 0;
761 3467 : atomic_set(&discard_ctl->discardable_extents, 0);
762 3467 : atomic64_set(&discard_ctl->discardable_bytes, 0);
763 3467 : discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
764 3467 : discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
765 3467 : discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
766 3467 : discard_ctl->kbps_limit = 0;
767 3467 : discard_ctl->discard_extent_bytes = 0;
768 3467 : discard_ctl->discard_bitmap_bytes = 0;
769 3467 : atomic64_set(&discard_ctl->discard_bytes_saved, 0);
770 3467 : }
771 :
772 3234 : void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
773 : {
774 3234 : btrfs_discard_stop(fs_info);
775 3234 : cancel_delayed_work_sync(&fs_info->discard_ctl.work);
776 3234 : btrfs_discard_purge_list(&fs_info->discard_ctl);
777 3234 : }