Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
4 : */
5 :
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_format.h"
9 : #include "xfs_log_format.h"
10 : #include "xfs_shared.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_extent_busy.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_trans_priv.h"
16 : #include "xfs_log.h"
17 : #include "xfs_log_priv.h"
18 : #include "xfs_trace.h"
19 :
20 : struct workqueue_struct *xfs_discard_wq;
21 :
22 : /*
23 : * Allocate a new ticket. Failing to get a new ticket makes it really hard to
24 : * recover, so we don't allow failure here. Also, we allocate in a context that
25 : * we don't want to be issuing transactions from, so we need to tell the
26 : * allocation code this as well.
27 : *
28 : * We don't reserve any space for the ticket - we are going to steal whatever
29 : * space we require from transactions as they commit. To ensure we reserve all
30 : * the space required, we need to set the current reservation of the ticket to
31 : * zero so that we know to steal the initial transaction overhead from the
32 : * first transaction commit.
33 : */
34 : static struct xlog_ticket *
35 2573195 : xlog_cil_ticket_alloc(
36 : struct xlog *log)
37 : {
38 2573195 : struct xlog_ticket *tic;
39 :
40 2573195 : tic = xlog_ticket_alloc(log, 0, 1, 0);
41 :
42 : /*
43 : * set the current reservation to zero so we know to steal the basic
44 : * transaction overhead reservation from the first transaction commit.
45 : */
46 2573193 : tic->t_curr_res = 0;
47 2573193 : tic->t_iclog_hdrs = 0;
48 2573193 : return tic;
49 : }
50 :
51 : static inline void
52 2595686 : xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
53 : {
54 2595686 : struct xlog *log = cil->xc_log;
55 :
56 5191372 : atomic_set(&cil->xc_iclog_hdrs,
57 2595686 : (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
58 2595686 : (log->l_iclog_size - log->l_iclog_hsize)));
59 2595686 : }
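/*
 * A standalone sketch of the header count calculation above, using
 * hypothetical geometry (these values are illustrative, not taken from
 * this listing): the blocking space limit divided by the usable payload
 * of one iclog gives the number of iclog headers a full CIL checkpoint
 * may consume.
 */
#include <stdio.h>

int main(void)
{
	int blocking_limit = 256 * 1024;	/* assumed XLOG_CIL_BLOCKING_SPACE_LIMIT */
	int iclog_size = 32 * 1024;		/* assumed log->l_iclog_size */
	int iclog_hsize = 512;			/* assumed log->l_iclog_hsize */

	/* usable payload per iclog is its size minus its header */
	int hdrs = blocking_limit / (iclog_size - iclog_hsize);

	printf("iclog headers reserved for a full CIL: %d\n", hdrs);	/* 8 */
	return 0;
}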
60 :
61 : /*
62 : * Check if the current log item was first committed in this sequence.
63 : * We can't rely on just the log item being in the CIL, we have to check
64 : * the recorded commit sequence number.
65 : *
66 : * Note: for this to be used in a non-racy manner, it has to be called with
67 : * CIL flushing locked out. As a result, it should only be used during the
68 : * transaction commit process when deciding what to format into the item.
69 : */
70 : static bool
71 1034479196 : xlog_item_in_current_chkpt(
72 : struct xfs_cil *cil,
73 : struct xfs_log_item *lip)
74 : {
75 2068958392 : if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
76 : return false;
77 :
78 : /*
79 : * li_seq is written on the first commit of a log item to record the
80 : * first checkpoint it is written to. Hence if it is different to the
81 : * current sequence, we're in a new checkpoint.
82 : */
83 1034400431 : return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
84 : }
85 :
86 : bool
87 651598872 : xfs_log_item_in_current_chkpt(
88 : struct xfs_log_item *lip)
89 : {
90 651598872 : return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
91 : }
92 :
93 : /*
94 : * Unavoidable forward declaration - xlog_cil_push_work() calls
95 : * xlog_cil_ctx_alloc() itself.
96 : */
97 : static void xlog_cil_push_work(struct work_struct *work);
98 :
99 : static struct xfs_cil_ctx *
100 2573203 : xlog_cil_ctx_alloc(void)
101 : {
102 2573203 : struct xfs_cil_ctx *ctx;
103 :
104 2573203 : ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
105 2573202 : INIT_LIST_HEAD(&ctx->committing);
106 2573202 : INIT_LIST_HEAD(&ctx->busy_extents);
107 2573202 : INIT_LIST_HEAD(&ctx->log_items);
108 2573202 : INIT_LIST_HEAD(&ctx->lv_chain);
109 2573202 : INIT_WORK(&ctx->push_work, xlog_cil_push_work);
110 2573202 : return ctx;
111 : }
112 :
113 : /*
114 : * Aggregate the CIL per-cpu structures into global counts, lists, etc. and
115 : * clear the percpu state ready for the next context to use. This is called
116 : * from the push code with the context lock held exclusively, hence nothing else
117 : * will be accessing or modifying the per-cpu counters.
118 : */
119 : static void
120 2550710 : xlog_cil_push_pcp_aggregate(
121 : struct xfs_cil *cil,
122 : struct xfs_cil_ctx *ctx)
123 : {
124 2550710 : struct xlog_cil_pcp *cilpcp;
125 2550710 : int cpu;
126 :
127 7651069 : for_each_online_cpu(cpu) {
128 5100357 : cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
129 :
130 5100359 : ctx->ticket->t_curr_res += cilpcp->space_reserved;
131 5100359 : cilpcp->space_reserved = 0;
132 :
133 5100359 : if (!list_empty(&cilpcp->busy_extents)) {
134 1924923 : list_splice_init(&cilpcp->busy_extents,
135 : &ctx->busy_extents);
136 : }
137 5100359 : if (!list_empty(&cilpcp->log_items))
138 4067383 : list_splice_init(&cilpcp->log_items, &ctx->log_items);
139 :
140 : /*
141 : * We're in the middle of switching cil contexts. Reset the
142 : * counter we use to detect when the current context is nearing
143 : * full.
144 : */
145 5100359 : cilpcp->space_used = 0;
146 : }
147 2550709 : }
148 :
149 : /*
150 : * Aggregate the CIL per-cpu space used counters into the global atomic value.
151 : * This is called when the per-cpu counter aggregation will first pass the soft
152 : * limit threshold so we can switch to atomic counter aggregation for accurate
153 : * detection of hard limit traversal.
154 : */
155 : static void
156 611 : xlog_cil_insert_pcp_aggregate(
157 : struct xfs_cil *cil,
158 : struct xfs_cil_ctx *ctx)
159 : {
160 611 : struct xlog_cil_pcp *cilpcp;
161 611 : int cpu;
162 611 : int count = 0;
163 :
164 : /* Trigger atomic updates then aggregate only for the first caller */
165 1222 : if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
166 : return;
167 :
168 1833 : for_each_online_cpu(cpu) {
169 1222 : int old, prev;
170 :
171 1222 : cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
172 1222 : do {
173 1222 : old = cilpcp->space_used;
174 1222 : prev = cmpxchg(&cilpcp->space_used, old, 0);
175 1222 : } while (old != prev);
176 1222 : count += old;
177 : }
178 611 : atomic_add(count, &ctx->space_used);
179 : }
180 :
181 : static void
182 2573203 : xlog_cil_ctx_switch(
183 : struct xfs_cil *cil,
184 : struct xfs_cil_ctx *ctx)
185 : {
186 2573203 : xlog_cil_set_iclog_hdr_count(cil);
187 2573203 : set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
188 2573203 : set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
189 2573202 : ctx->sequence = ++cil->xc_current_sequence;
190 2573202 : ctx->cil = cil;
191 2573202 : cil->xc_ctx = ctx;
192 2573202 : }
193 :
194 : /*
195 : * After the first stage of log recovery is done, we know where the head and
196 : * tail of the log are. We need this log initialisation done before we can
197 : * initialise the first CIL checkpoint context.
198 : *
199 : * Here we allocate a log ticket to track space usage during a CIL push. This
200 : * ticket is passed to xlog_write() directly so that we don't slowly leak log
201 : * space by failing to account for space used by log headers and additional
202 : * region headers for split regions.
203 : */
204 : void
205 22483 : xlog_cil_init_post_recovery(
206 : struct xlog *log)
207 : {
208 22483 : log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
209 22483 : log->l_cilp->xc_ctx->sequence = 1;
210 22483 : xlog_cil_set_iclog_hdr_count(log->l_cilp);
211 22483 : }
212 :
213 : static inline int
214 : xlog_cil_iovec_space(
215 : uint niovecs)
216 : {
217 14976959087 : return round_up((sizeof(struct xfs_log_vec) +
218 : niovecs * sizeof(struct xfs_log_iovec)),
219 : sizeof(uint64_t));
220 : }
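/*
 * A userspace model of xlog_cil_iovec_space(), with assumed structure
 * sizes (the real sizes depend on the kernel build): the vector header
 * plus the iovec array, rounded up so that the data region that follows
 * starts 64-bit aligned.
 */
#include <stdio.h>
#include <stdint.h>

#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	size_t lv_size = 48;	/* assumed sizeof(struct xfs_log_vec) */
	size_t iov_size = 16;	/* assumed sizeof(struct xfs_log_iovec) */
	unsigned int niovecs = 3;

	size_t space = round_up(lv_size + niovecs * iov_size, sizeof(uint64_t));
	printf("iovec space for %u vectors: %zu bytes\n", niovecs, space);	/* 96 */
	return 0;
}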
221 :
222 : /*
223 : * Allocate or pin log vector buffers for CIL insertion.
224 : *
225 : * The CIL currently uses disposable buffers for copying a snapshot of the
226 : * modified items into the log during a push. The biggest problem with this is
227 : * the requirement to allocate the disposable buffer during the commit if:
228 : * a) it does not exist; or
229 : * b) it is too small
230 : *
231 : * If we do this allocation within xlog_cil_insert_format_items(), it is done
232 : * under the xc_ctx_lock, which means that a CIL push cannot occur during
233 : * the memory allocation. This means that we have a potential deadlock situation
234 : * under low memory conditions when we have lots of dirty metadata pinned in
235 : * the CIL and we need a CIL commit to occur to free memory.
236 : *
237 : * To avoid this, we need to move the memory allocation outside the
238 : * xc_ctx_lock, but because the log vector buffers are disposable, that opens
239 : * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
240 : * vector buffers between the check and the formatting of the item into the
241 : * log vector buffer within the xc_ctx_lock.
242 : *
243 : * Because the log vector buffer needs to be unchanged during the CIL push
244 : * process, we cannot share the buffer between the transaction commit (which
245 : * modifies the buffer) and the CIL push context that is writing the changes
246 : * into the log. This means skipping preallocation of buffer space is
247 : * unreliable, but we most definitely do not want to be allocating and freeing
248 : * buffers unnecessarily during commits when overwrites can be done safely.
249 : *
250 : * The simplest solution to this problem is to allocate a shadow buffer when a
251 : * log item is committed for the second time, and then to only use this buffer
252 : * if necessary. The buffer can remain attached to the log item until such time
253 : * it is needed, and this is the buffer that is reallocated to match the size of
254 : * the incoming modification. Then during the formatting of the item we can swap
255 : * the active buffer with the new one if we can't reuse the existing buffer. We
256 : * don't free the old buffer as it may be reused on the next modification if
257 : * its size is right; otherwise we'll free and reallocate it at that point.
258 : *
259 : * This function builds a vector for the changes in each log item in the
260 : * transaction. It then works out the length of the buffer needed for each log
261 : * item, allocates them and attaches the vector to the log item in preparation
262 : * for the formatting step which occurs under the xc_ctx_lock.
263 : *
264 : * While this means the memory footprint goes up, it avoids the repeated
265 : * alloc/free pattern that repeated modifications of an item would otherwise
266 : * cause, and hence minimises the CPU overhead of such behaviour.
267 : */
268 : static void
269 901767291 : xlog_cil_alloc_shadow_bufs(
270 : struct xlog *log,
271 : struct xfs_trans *tp)
272 : {
273 901767291 : struct xfs_log_item *lip;
274 :
275 6395077614 : list_for_each_entry(lip, &tp->t_items, li_trans) {
276 5493344101 : struct xfs_log_vec *lv;
277 5493344101 : int niovecs = 0;
278 5493344101 : int nbytes = 0;
279 5493344101 : int buf_size;
280 5493344101 : bool ordered = false;
281 :
282 : /* Skip items which aren't dirty in this transaction. */
283 5493344101 : if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
284 520922381 : continue;
285 :
286 : /* get number of vecs and size of data to be stored */
287 4972421720 : lip->li_ops->iop_size(lip, &niovecs, &nbytes);
288 :
289 : /*
290 : * Ordered items need to be tracked but we do not wish to write
291 : * them. We need a logvec to track the object, but we do not
292 : * need an iovec or buffer to be allocated for copying data.
293 : */
294 4971819910 : if (niovecs == XFS_LOG_VEC_ORDERED) {
295 557468 : ordered = true;
296 557468 : niovecs = 0;
297 557468 : nbytes = 0;
298 : }
299 :
300 : /*
301 : * We 64-bit align the length of each iovec so that the start of
302 : * the next one is naturally aligned. We'll need to account for
303 : * that slack space here.
304 : *
305 : * We also add the xlog_op_header to each region when
306 : * formatting, but that's not accounted to the size of the item
307 : * at this point. Hence we'll need an additional number of bytes
308 : * for each vector to hold an opheader.
309 : *
310 : * Then round nbytes up to 64-bit alignment so that the initial
311 : * buffer alignment is easy to calculate and verify.
312 : */
313 4971819910 : nbytes += niovecs *
314 : (sizeof(uint64_t) + sizeof(struct xlog_op_header));
315 4971819910 : nbytes = round_up(nbytes, sizeof(uint64_t));
316 :
317 : /*
318 : * The data buffer needs to start 64-bit aligned, so round up
319 : * that space to ensure we can align it appropriately and not
320 : * overrun the buffer.
321 : */
322 4971819910 : buf_size = nbytes + xlog_cil_iovec_space(niovecs);
323 :
324 : /*
325 : * if we have no shadow buffer, or it is too small, we need to
326 : * reallocate it.
327 : */
328 4971819910 : if (!lip->li_lv_shadow ||
329 3758596815 : buf_size > lip->li_lv_shadow->lv_size) {
330 : /*
331 : * We free and allocate here as a realloc would copy
332 : * unnecessary data. We don't use kvzalloc() for the
333 : * same reason - we don't need to zero the data area in
334 : * the buffer, only the log vector header and the iovec
335 : * storage.
336 : */
337 1277544158 : kmem_free(lip->li_lv_shadow);
338 1278125023 : lv = xlog_kvmalloc(buf_size);
339 :
340 1278112190 : memset(lv, 0, xlog_cil_iovec_space(niovecs));
341 :
342 1278112190 : INIT_LIST_HEAD(&lv->lv_list);
343 1278112190 : lv->lv_item = lip;
344 1278112190 : lv->lv_size = buf_size;
345 1278112190 : if (ordered)
346 547476 : lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
347 : else
348 1277564714 : lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
349 1278112190 : lip->li_lv_shadow = lv;
350 : } else {
351 : /* same or smaller, optimise common overwrite case */
352 3694275752 : lv = lip->li_lv_shadow;
353 3694275752 : if (ordered)
354 9993 : lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
355 : else
356 3694265759 : lv->lv_buf_len = 0;
357 3694275752 : lv->lv_bytes = 0;
358 : }
359 :
360 : /* Ensure the lv is set up according to ->iop_size */
361 4972387942 : lv->lv_niovecs = niovecs;
362 :
363 : /* The allocated data region lies beyond the iovec region */
364 4972387942 : lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
365 : }
366 :
367 901733513 : }
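/*
 * A sketch of the shadow buffer sizing performed above, with assumed
 * structure sizes (illustrative, not authoritative): the payload from
 * ->iop_size plus per-iovec alignment slack and opheader space, plus
 * the aligned vector/iovec header region.
 */
#include <stdio.h>
#include <stdint.h>

#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	int niovecs = 2;
	int nbytes = 300;	/* payload reported by ->iop_size, assumed */
	int op_hdr = 12;	/* assumed sizeof(struct xlog_op_header) */
	int lv_hdr = 48;	/* assumed sizeof(struct xfs_log_vec) */
	int iov = 16;		/* assumed sizeof(struct xfs_log_iovec) */

	/* per-iovec slack for 64-bit alignment plus an opheader each */
	nbytes += niovecs * (sizeof(uint64_t) + op_hdr);
	nbytes = round_up(nbytes, sizeof(uint64_t));

	/* the data region sits after the aligned vector/iovec header region */
	int iovec_space = round_up(lv_hdr + niovecs * iov, sizeof(uint64_t));
	int buf_size = nbytes + iovec_space;

	printf("shadow buffer size: %d bytes\n", buf_size);	/* 344 + 80 = 424 */
	return 0;
}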
368 :
369 : /*
370 : * Prepare the log item for insertion into the CIL. Calculate the difference in
371 : * log space it will consume, and if it is a new item pin it as well.
372 : */
373 : STATIC void
374 4589841402 : xfs_cil_prepare_item(
375 : struct xlog *log,
376 : struct xfs_log_vec *lv,
377 : struct xfs_log_vec *old_lv,
378 : int *diff_len)
379 : {
380 : /* Account for the new LV being passed in */
381 4589841402 : if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
382 4588901647 : *diff_len += lv->lv_bytes;
383 :
384 : /*
385 : * If there is no old LV, this is the first time we've seen the item in
386 : * this CIL context and so we need to pin it. If we are replacing the
387 : * old_lv, then remove the space it accounts for and make it the shadow
388 : * buffer for later freeing. In both cases we are now switching to the
389 : * shadow buffer, so update the pointer to it appropriately.
390 : */
391 4589841402 : if (!old_lv) {
392 797995194 : if (lv->lv_item->li_ops->iop_pin)
393 414051255 : lv->lv_item->li_ops->iop_pin(lv->lv_item);
394 797990638 : lv->lv_item->li_lv_shadow = NULL;
395 3791846208 : } else if (old_lv != lv) {
396 37036986 : ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
397 :
398 37036986 : *diff_len -= old_lv->lv_bytes;
399 37036986 : lv->lv_item->li_lv_shadow = old_lv;
400 : }
401 :
402 : /* attach new log vector to log item */
403 4589836846 : lv->lv_item->li_lv = lv;
404 :
405 : /*
406 : * If this is the first time the item is being committed to the
407 : * CIL, store the sequence number on the log item so we can
408 : * tell in future commits whether this is the first checkpoint
409 : * the item is being committed into.
410 : */
411 4589836846 : if (!lv->lv_item->li_seq)
412 466931557 : lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
413 4589836846 : }
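/*
 * A model of the CIL space delta accounting above (illustrative values):
 * relogging an item replaces its old log vector, so only the difference
 * in formatted size is added to the space the CIL consumes.
 */
#include <stdio.h>

int main(void)
{
	int diff_len = 0;

	/* first commit: no old_lv, the whole 256-byte vector is new space */
	diff_len += 256;

	/* relog into a larger 320-byte shadow vector: add the new bytes... */
	diff_len += 320;	/* new lv->lv_bytes */
	diff_len -= 256;	/* ...and subtract the replaced old_lv->lv_bytes */

	printf("CIL space delta: %d bytes\n", diff_len);	/* 320 */
	return 0;
}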
414 :
415 : /*
416 : * Format log items into flat buffers
417 : *
418 : * For delayed logging, we need to hold a formatted buffer containing all the
419 : * changes on the log item. This enables us to relog the item in memory and
420 : * write it out asynchronously without needing to relock the object that was
421 : * modified at the time it gets written into the iclog.
422 : *
423 : * This function takes the prepared log vectors attached to each log item, and
424 : * formats the changes into the log vector buffer. The buffer it uses is
425 : * dependent on the current state of the vector in the CIL - the shadow lv is
426 : * guaranteed to be large enough for the current modification, but we will only
427 : * use that if we can't reuse the existing lv. If we can't reuse the existing
428 : * lv, then simply swap it out for the shadow lv. We don't free it - that is
429 : * done lazily either by the next modification or the freeing of the log item.
430 : *
431 : * We don't set up region headers during this process; we simply copy the
432 : * regions into the flat buffer. We can do this because we still have to do a
433 : * formatting step to write the regions into the iclog buffer. Writing the
434 : * ophdrs during the iclog write means that we can support splitting large
435 : * regions across iclog boundaries without needing a change in the format of the
436 : * item/region encapsulation.
437 : *
438 : * Hence what we need to do now is rewrite the vector array to point
439 : * to the copied region inside the buffer we just allocated. This allows us to
440 : * format the regions into the iclog as though they are being formatted
441 : * directly out of the objects themselves.
442 : */
443 : static void
444 901776972 : xlog_cil_insert_format_items(
445 : struct xlog *log,
446 : struct xfs_trans *tp,
447 : int *diff_len)
448 : {
449 901776972 : struct xfs_log_item *lip;
450 :
451 : /* Bail out if we didn't find a log item. */
452 901776972 : if (list_empty(&tp->t_items)) {
453 0 : ASSERT(0);
454 0 : return;
455 : }
456 :
457 6011779667 : list_for_each_entry(lip, &tp->t_items, li_trans) {
458 5110043445 : struct xfs_log_vec *lv;
459 5110043445 : struct xfs_log_vec *old_lv = NULL;
460 5110043445 : struct xfs_log_vec *shadow;
461 5110043445 : bool ordered = false;
462 :
463 : /* Skip items which aren't dirty in this transaction. */
464 5110043445 : if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
465 520921594 : continue;
466 :
467 : /*
468 : * The formatting size information is already attached to
469 : * the shadow lv on the log item.
470 : */
471 4589121851 : shadow = lip->li_lv_shadow;
472 4589121851 : if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
473 557467 : ordered = true;
474 :
475 : /* Skip items that do not have any vectors for writing */
476 4589121851 : if (!shadow->lv_niovecs && !ordered)
477 0 : continue;
478 :
479 : /* compare to existing item size */
480 4589121851 : old_lv = lip->li_lv;
481 4589121851 : if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
482 : /* same or smaller, optimise common overwrite case */
483 3754659031 : lv = lip->li_lv;
484 :
485 3754659031 : if (ordered)
486 19986 : goto insert;
487 :
488 : /*
489 : * set the item up as though it is a new insertion so
490 : * that the space reservation accounting is correct.
491 : */
492 3754639045 : *diff_len -= lv->lv_bytes;
493 :
494 : /* Ensure the lv is set up according to ->iop_size */
495 3754639045 : lv->lv_niovecs = shadow->lv_niovecs;
496 :
497 : /* reset the lv buffer information for new formatting */
498 3754639045 : lv->lv_buf_len = 0;
499 3754639045 : lv->lv_bytes = 0;
500 3754639045 : lv->lv_buf = (char *)lv +
501 3754639045 : xlog_cil_iovec_space(lv->lv_niovecs);
502 : } else {
503 : /* switch to shadow buffer! */
504 834462820 : lv = shadow;
505 834462820 : lv->lv_item = lip;
506 834462820 : if (ordered) {
507 : /* track as an ordered logvec */
508 537483 : ASSERT(lip->li_lv == NULL);
509 537483 : goto insert;
510 : }
511 : }
512 :
513 4588564382 : ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
514 4588564382 : lip->li_ops->iop_format(lip, lv);
515 4589527280 : insert:
516 4589527280 : xfs_cil_prepare_item(log, lv, old_lv, diff_len);
517 : }
518 : }
519 :
520 : /*
521 : * The use of lockless waitqueue_active() requires that the caller has
522 : * serialised itself against the wakeup call in xlog_cil_push_work(). That
523 : * can be done by either holding the push lock or the context lock.
524 : */
525 : static inline bool
526 805799345 : xlog_cil_over_hard_limit(
527 : struct xlog *log,
528 : int32_t space_used)
529 : {
530 805799345 : if (waitqueue_active(&log->l_cilp->xc_push_wait))
531 : return true;
532 805810480 : if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
533 0 : return true;
534 : return false;
535 : }
536 :
537 : /*
538 : * Insert the log items into the CIL and calculate the difference in space
539 : * consumed by the item. Add the space to the checkpoint ticket and calculate
540 : * if the change requires additional log metadata. If it does, take that space
541 : * as well. Remove the amount of space we added to the checkpoint ticket from
542 : * the current transaction ticket so that the accounting works out correctly.
543 : */
544 : static void
545 901769215 : xlog_cil_insert_items(
546 : struct xlog *log,
547 : struct xfs_trans *tp,
548 : uint32_t released_space)
549 : {
550 901769215 : struct xfs_cil *cil = log->l_cilp;
551 901769215 : struct xfs_cil_ctx *ctx = cil->xc_ctx;
552 901769215 : struct xfs_log_item *lip;
553 901769215 : int len = 0;
554 901769215 : int iovhdr_res = 0, split_res = 0, ctx_res = 0;
555 901769215 : int space_used;
556 901769215 : int order;
557 901769215 : struct xlog_cil_pcp *cilpcp;
558 :
559 901769215 : ASSERT(tp);
560 :
561 : /*
562 : * We can do this safely because the context can't checkpoint until we
563 : * are done so it doesn't matter exactly how we update the CIL.
564 : */
565 901769215 : xlog_cil_insert_format_items(log, tp, &len);
566 :
567 : /*
568 : * Subtract the space released by intent cancelation from the space we
569 : * consumed so that we remove it from the CIL space and add it back to
570 : * the current transaction reservation context.
571 : */
572 901739467 : len -= released_space;
573 :
574 : /*
575 : * Grab the per-cpu pointer for the CIL before we start any accounting.
576 : * That ensures that we are running with pre-emption disabled and so we
577 : * can't be scheduled away between split sample/update operations that
578 : * are done without outside locking to serialise them.
579 : */
580 901739467 : cilpcp = get_cpu_ptr(cil->xc_pcp);
581 :
582 : /*
583 : * We need to take the CIL checkpoint unit reservation on the first
584 : * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
585 : * unnecessarily do an atomic op in the fast path here. We can clear the
586 : * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
587 : * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
588 : */
589 1806115679 : if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
590 : test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
591 2550705 : ctx_res = ctx->ticket->t_unit_res;
592 :
593 : /*
594 : * Check if we need to steal iclog headers. atomic_read() is not a
595 : * locked atomic operation, so we can check the value before we do any
596 : * real atomic ops in the fast path. If we've already taken the CIL unit
597 : * reservation from this commit, we've already got one iclog header
598 : * space reserved so we have to account for that otherwise we risk
599 : * overrunning the reservation on this ticket.
600 : *
601 : * If the CIL is already at the hard limit, we might need more header
602 : * space than originally reserved. So steal more header space from every
603 : * commit that occurs once we are over the hard limit to ensure the CIL
604 : * push won't run out of reservation space.
605 : *
606 : * This can steal more than we need, but that's OK.
607 : *
608 : * The cil->xc_ctx_lock provides the serialisation necessary for safely
609 : * calling xlog_cil_over_hard_limit() in this context.
610 : */
611 901782462 : space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
612 1707557014 : if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
613 805781634 : xlog_cil_over_hard_limit(log, space_used)) {
614 96000828 : split_res = log->l_iclog_hsize +
615 : sizeof(struct xlog_op_header);
616 96000828 : if (ctx_res)
617 2550705 : ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
618 : else
619 93450123 : ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
620 96000828 : atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
621 : }
622 901782340 : cilpcp->space_reserved += ctx_res;
623 :
624 : /*
625 : * Accurately account when over the soft limit, otherwise fold the
626 : * percpu count into the global count if over the per-cpu threshold.
627 : */
628 901782340 : if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
629 21806 : atomic_add(len, &ctx->space_used);
630 901760534 : } else if (cilpcp->space_used + len >
631 901760540 : (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
632 215267 : space_used = atomic_add_return(cilpcp->space_used + len,
633 : &ctx->space_used);
634 215267 : cilpcp->space_used = 0;
635 :
636 : /*
637 : * If we just transitioned over the soft limit, we need to
638 : * transition to the global atomic counter.
639 : */
640 215267 : if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
641 611 : xlog_cil_insert_pcp_aggregate(cil, ctx);
642 : } else {
643 901545267 : cilpcp->space_used += len;
644 : }
645 : /* attach the transaction to the CIL if it has any busy extents */
646 901782340 : if (!list_empty(&tp->t_busy))
647 38949279 : list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
648 :
649 : /*
650 : * Now update the order of everything modified in the transaction
651 : * and insert items into the CIL if they aren't already there.
652 : * We do this here so we only need to take the CIL lock once during
653 : * the transaction commit.
654 : */
655 901782340 : order = atomic_inc_return(&ctx->order_id);
656 6013305244 : list_for_each_entry(lip, &tp->t_items, li_trans) {
657 : /* Skip items which aren't dirty in this transaction. */
658 5111483081 : if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
659 520933175 : continue;
660 :
661 4590549906 : lip->li_order_id = order;
662 4590549906 : if (!list_empty(&lip->li_cil))
663 3792522905 : continue;
664 798027001 : list_add_tail(&lip->li_cil, &cilpcp->log_items);
665 : }
666 901822163 : put_cpu_ptr(cilpcp);
667 :
668 : /*
669 : * If we've overrun the reservation, dump the tx details before we move
670 : * the log items. Shutdown is imminent...
671 : */
672 901793515 : tp->t_ticket->t_curr_res -= ctx_res + len;
673 901793515 : if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
674 0 : xfs_warn(log->l_mp, "Transaction log reservation overrun:");
675 0 : xfs_warn(log->l_mp,
676 : " log items: %d bytes (iov hdrs: %d bytes)",
677 : len, iovhdr_res);
678 0 : xfs_warn(log->l_mp, " split region headers: %d bytes",
679 : split_res);
680 0 : xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
681 0 : xlog_print_trans(tp);
682 0 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
683 : }
684 901793515 : }
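/*
 * A single-threaded model of the per-cpu space accounting decision made
 * above. The fold-to-global transition mirrors the logic, but the limits
 * are assumed values and the real code uses per-cpu storage and atomics.
 */
#include <stdio.h>
#include <stdbool.h>

#define SOFT_LIMIT	(256 * 1024)	/* assumed XLOG_CIL_SPACE_LIMIT */
#define NUM_CPUS	4		/* assumed num_online_cpus() */

static int global_used;			/* models ctx->space_used */
static bool pcp_space = true;		/* models the XLOG_CIL_PCP_SPACE flag */

static void account(int *pcp_used, int len)
{
	if (!pcp_space) {
		/* over the soft limit: accurate global accounting */
		global_used += len;
	} else if (*pcp_used + len > SOFT_LIMIT / NUM_CPUS) {
		/* fold this CPU's local count into the global counter */
		global_used += *pcp_used + len;
		*pcp_used = 0;
		if (global_used >= SOFT_LIMIT)
			pcp_space = false;	/* switch everyone to atomics */
	} else {
		*pcp_used += len;		/* cheap local accumulation */
	}
}

int main(void)
{
	int cpu0 = 0;

	account(&cpu0, 60 * 1024);	/* below the 64k per-cpu share: stays local */
	account(&cpu0, 8 * 1024);	/* crosses the share: folds into global */
	printf("local %d, global %d, pcp mode %d\n", cpu0, global_used, pcp_space);
	return 0;
}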
685 :
686 : static void
687 2550712 : xlog_cil_free_logvec(
688 : struct list_head *lv_chain)
689 : {
690 2550712 : struct xfs_log_vec *lv;
691 :
692 418107246 : while (!list_empty(lv_chain)) {
693 415556534 : lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
694 415556534 : list_del_init(&lv->lv_list);
695 415556519 : kmem_free(lv);
696 : }
697 2550712 : }
698 :
699 : static void
700 55 : xlog_discard_endio_work(
701 : struct work_struct *work)
702 : {
703 55 : struct xfs_cil_ctx *ctx =
704 55 : container_of(work, struct xfs_cil_ctx, discard_endio_work);
705 55 : struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
706 :
707 55 : xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
708 55 : kmem_free(ctx);
709 55 : }
710 :
711 : /*
712 : * Queue up the actual completion to a thread to avoid IRQ-safe locking for
713 : * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
714 : * get the execution delayed up to 30 seconds for weird reasons.
715 : */
716 : static void
717 55 : xlog_discard_endio(
718 : struct bio *bio)
719 : {
720 55 : struct xfs_cil_ctx *ctx = bio->bi_private;
721 :
722 55 : INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
723 55 : queue_work(xfs_discard_wq, &ctx->discard_endio_work);
724 55 : bio_put(bio);
725 55 : }
726 :
727 : static void
728 55 : xlog_discard_busy_extents(
729 : struct xfs_mount *mp,
730 : struct xfs_cil_ctx *ctx)
731 : {
732 55 : struct list_head *list = &ctx->busy_extents;
733 55 : struct xfs_extent_busy *busyp;
734 55 : struct bio *bio = NULL;
735 55 : struct blk_plug plug;
736 55 : int error = 0;
737 :
738 55 : ASSERT(xfs_has_discard(mp));
739 :
740 55 : blk_start_plug(&plug);
741 307 : list_for_each_entry(busyp, list, list) {
742 252 : trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
743 : busyp->length);
744 :
745 756 : error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
746 252 : XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
747 252 : XFS_FSB_TO_BB(mp, busyp->length),
748 : GFP_NOFS, &bio);
749 252 : if (error && error != -EOPNOTSUPP) {
750 0 : xfs_info(mp,
751 : "discard failed for extent [0x%llx,%u], error %d",
752 : (unsigned long long)busyp->bno,
753 : busyp->length,
754 : error);
755 0 : break;
756 : }
757 : }
758 :
759 55 : if (bio) {
760 55 : bio->bi_private = ctx;
761 55 : bio->bi_end_io = xlog_discard_endio;
762 55 : submit_bio(bio);
763 : } else {
764 0 : xlog_discard_endio_work(&ctx->discard_endio_work);
765 : }
766 55 : blk_finish_plug(&plug);
767 55 : }
768 :
769 : /*
770 : * Mark all items committed and clear busy extents. We free the log vector
771 : * chains in a separate pass so that we unpin the log items as quickly as
772 : * possible.
773 : */
774 : static void
775 2550712 : xlog_cil_committed(
776 : struct xfs_cil_ctx *ctx)
777 : {
778 2550712 : struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
779 2550712 : bool abort = xlog_is_shutdown(ctx->cil->xc_log);
780 :
781 : /*
782 : * If the I/O failed, we're aborting the commit and already shutdown.
783 : * Wake any commit waiters before aborting the log items so we don't
784 : * block async log pushers on callbacks. Async log pushers explicitly do
785 : * not wait on log force completion because they may be holding locks
786 : * required to unpin items.
787 : */
788 2550712 : if (abort) {
789 10502 : spin_lock(&ctx->cil->xc_push_lock);
790 10502 : wake_up_all(&ctx->cil->xc_start_wait);
791 10502 : wake_up_all(&ctx->cil->xc_commit_wait);
792 10502 : spin_unlock(&ctx->cil->xc_push_lock);
793 : }
794 :
795 2550712 : xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
796 : ctx->start_lsn, abort);
797 :
798 2550712 : xfs_extent_busy_sort(&ctx->busy_extents);
799 5101424 : xfs_extent_busy_clear(mp, &ctx->busy_extents,
800 2550712 : xfs_has_discard(mp) && !abort);
801 :
802 2550712 : spin_lock(&ctx->cil->xc_push_lock);
803 2550712 : list_del(&ctx->committing);
804 2550712 : spin_unlock(&ctx->cil->xc_push_lock);
805 :
806 2550712 : xlog_cil_free_logvec(&ctx->lv_chain);
807 :
808 2550712 : if (!list_empty(&ctx->busy_extents))
809 55 : xlog_discard_busy_extents(mp, ctx);
810 : else
811 2550657 : kmem_free(ctx);
812 2550712 : }
813 :
814 : void
815 12657353 : xlog_cil_process_committed(
816 : struct list_head *list)
817 : {
818 12657353 : struct xfs_cil_ctx *ctx;
819 :
820 15201636 : while ((ctx = list_first_entry_or_null(list,
821 : struct xfs_cil_ctx, iclog_entry))) {
822 2544283 : list_del(&ctx->iclog_entry);
823 2544283 : xlog_cil_committed(ctx);
824 : }
825 12657353 : }
826 :
827 : /*
828 : * Record the LSN of the iclog we were just granted space to start writing into.
829 : * If the context doesn't have a start_lsn recorded, then this iclog will
830 : * contain the start record for the checkpoint. Otherwise this write contains
831 : * the commit record for the checkpoint.
832 : */
833 : void
834 5091188 : xlog_cil_set_ctx_write_state(
835 : struct xfs_cil_ctx *ctx,
836 : struct xlog_in_core *iclog)
837 : {
838 5091188 : struct xfs_cil *cil = ctx->cil;
839 5091188 : xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
840 :
841 5091188 : ASSERT(!ctx->commit_lsn);
842 5091188 : if (!ctx->start_lsn) {
843 2546911 : spin_lock(&cil->xc_push_lock);
844 : /*
845 : * The LSN we need to pass to the log items on transaction
846 : * commit is the LSN reported by the first log vector write, not
847 : * the commit lsn. If we use the commit record lsn then we can
848 : * move the grant write head beyond the tail LSN and overwrite
849 : * it.
850 : */
851 2546911 : ctx->start_lsn = lsn;
852 2546911 : wake_up_all(&cil->xc_start_wait);
853 2546910 : spin_unlock(&cil->xc_push_lock);
854 :
855 : /*
856 : * Make sure the metadata we are about to overwrite in the log
857 : * has been flushed to stable storage before this iclog is
858 : * issued.
859 : */
860 2546910 : spin_lock(&cil->xc_log->l_icloglock);
861 2546910 : iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
862 2546910 : spin_unlock(&cil->xc_log->l_icloglock);
863 2546910 : return;
864 : }
865 :
866 : /*
867 : * Take a reference to the iclog for the context so that we still hold
868 : * it when xlog_write is done and has released it. This means the
869 : * context controls when the iclog is released for IO.
870 : */
871 2544277 : atomic_inc(&iclog->ic_refcnt);
872 :
873 : /*
874 : * xlog_state_get_iclog_space() guarantees there is enough space in the
875 : * iclog for an entire commit record, so we can attach the context
876 : * callbacks now. This needs to be done before we make the commit_lsn
877 : * visible to waiters so that checkpoints with commit records in the
878 : * same iclog order their IO completion callbacks in the same order that
879 : * the commit records appear in the iclog.
880 : */
881 2544283 : spin_lock(&cil->xc_log->l_icloglock);
882 2544282 : list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks);
883 2544283 : spin_unlock(&cil->xc_log->l_icloglock);
884 :
885 : /*
886 : * Now we can record the commit LSN and wake anyone waiting for this
887 : * sequence to have the ordered commit record assigned to a physical
888 : * location in the log.
889 : */
890 2544283 : spin_lock(&cil->xc_push_lock);
891 2544283 : ctx->commit_iclog = iclog;
892 2544283 : ctx->commit_lsn = lsn;
893 2544283 : wake_up_all(&cil->xc_commit_wait);
894 2544283 : spin_unlock(&cil->xc_push_lock);
895 : }
896 :
897 :
898 : /*
899 : * Ensure that the order of log writes follows checkpoint sequence order. This
900 : * relies on the context LSN being zero until the log write has guaranteed the
901 : * LSN that the log write will start at via xlog_state_get_iclog_space().
902 : */
903 : enum _record_type {
904 : _START_RECORD,
905 : _COMMIT_RECORD,
906 : };
907 :
908 : static int
909 5094992 : xlog_cil_order_write(
910 : struct xfs_cil *cil,
911 : xfs_csn_t sequence,
912 : enum _record_type record)
913 : {
914 5101486 : struct xfs_cil_ctx *ctx;
915 :
916 : restart:
917 5101486 : spin_lock(&cil->xc_push_lock);
918 10740547 : list_for_each_entry(ctx, &cil->xc_committing, committing) {
919 : /*
920 : * Avoid getting stuck in this loop because we were woken by the
921 : * shutdown, but then went back to sleep once already in the
922 : * shutdown state.
923 : */
924 11298712 : if (xlog_is_shutdown(cil->xc_log)) {
925 3800 : spin_unlock(&cil->xc_push_lock);
926 3800 : return -EIO;
927 : }
928 :
929 : /*
930 : * Higher sequences will wait for this one so skip them.
931 : * Don't wait for our own sequence, either.
932 : */
933 5645556 : if (ctx->sequence >= sequence)
934 5110312 : continue;
935 :
936 : /* Wait until the LSN for the record has been recorded. */
937 535244 : switch (record) {
938 342516 : case _START_RECORD:
939 342516 : if (!ctx->start_lsn) {
940 855 : xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
941 855 : goto restart;
942 : }
943 : break;
944 192728 : case _COMMIT_RECORD:
945 192728 : if (!ctx->commit_lsn) {
946 5639 : xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
947 5639 : goto restart;
948 : }
949 : break;
950 : }
951 : }
952 5091191 : spin_unlock(&cil->xc_push_lock);
953 5091191 : return 0;
954 : }
955 :
956 : /*
957 : * Write out the log vector change now attached to the CIL context. This will
958 : * write a start record that needs to be strictly ordered in ascending CIL
959 : * sequence order so that log recovery will always use in-order start LSNs when
960 : * replaying checkpoints.
961 : */
962 : static int
963 2550709 : xlog_cil_write_chain(
964 : struct xfs_cil_ctx *ctx,
965 : uint32_t chain_len)
966 : {
967 2550709 : struct xlog *log = ctx->cil->xc_log;
968 2550709 : int error;
969 :
970 2550709 : error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
971 2550709 : if (error)
972 : return error;
973 2546906 : return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
974 : }
975 :
976 : /*
977 : * Write out the commit record of a checkpoint transaction to close off a
978 : * running log write. These commit records are strictly ordered in ascending CIL
979 : * sequence order so that log recovery will always replay the checkpoints in the
980 : * correct order.
981 : */
982 : static int
983 2544283 : xlog_cil_write_commit_record(
984 : struct xfs_cil_ctx *ctx)
985 : {
986 2544283 : struct xlog *log = ctx->cil->xc_log;
987 5088566 : struct xlog_op_header ophdr = {
988 : .oh_clientid = XFS_TRANSACTION,
989 2544283 : .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
990 : .oh_flags = XLOG_COMMIT_TRANS,
991 : };
992 2544283 : struct xfs_log_iovec reg = {
993 : .i_addr = &ophdr,
994 : .i_len = sizeof(struct xlog_op_header),
995 : .i_type = XLOG_REG_TYPE_COMMIT,
996 : };
997 2544283 : struct xfs_log_vec vec = {
998 : .lv_niovecs = 1,
999 : .lv_iovecp = ®,
1000 : };
1001 2544283 : int error;
1002 2544283 : LIST_HEAD(lv_chain);
1003 2544283 : list_add(&vec.lv_list, &lv_chain);
1004 :
1005 5088566 : if (xlog_is_shutdown(log))
1006 : return -EIO;
1007 :
1008 2544283 : error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD);
1009 2544283 : if (error)
1010 : return error;
1011 :
1012 : /* account for space used by record data */
1013 2544283 : ctx->ticket->t_curr_res -= reg.i_len;
1014 2544283 : error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
1015 2544283 : if (error)
1016 1 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1017 : return error;
1018 : }
1019 :
1020 : struct xlog_cil_trans_hdr {
1021 : struct xlog_op_header oph[2];
1022 : struct xfs_trans_header thdr;
1023 : struct xfs_log_iovec lhdr[2];
1024 : };
1025 :
1026 : /*
1027 : * Build a checkpoint transaction header to begin the journal transaction. We
1028 : * need to account for the space used by the transaction header here as it is
1029 : * not accounted for in xlog_write().
1030 : *
1031 : * This is the only place we write a transaction header, so we also build the
1032 : * log opheaders that indicate the start of a log transaction and wrap the
1033 : * transaction header. We keep the start record in its own log vector rather
1034 : * than compacting them into a single region as this ends up making the logic
1035 : * in xlog_write() for handling empty opheaders for start, commit and unmount
1036 : * records much simpler.
1037 : */
1038 : static void
1039 2550711 : xlog_cil_build_trans_hdr(
1040 : struct xfs_cil_ctx *ctx,
1041 : struct xlog_cil_trans_hdr *hdr,
1042 : struct xfs_log_vec *lvhdr,
1043 : int num_iovecs)
1044 : {
1045 2550711 : struct xlog_ticket *tic = ctx->ticket;
1046 2550711 : __be32 tid = cpu_to_be32(tic->t_tid);
1047 :
1048 2550711 : memset(hdr, 0, sizeof(*hdr));
1049 :
1050 : /* Log start record */
1051 2550711 : hdr->oph[0].oh_tid = tid;
1052 2550711 : hdr->oph[0].oh_clientid = XFS_TRANSACTION;
1053 2550711 : hdr->oph[0].oh_flags = XLOG_START_TRANS;
1054 :
1055 : /* log iovec region pointer */
1056 2550711 : hdr->lhdr[0].i_addr = &hdr->oph[0];
1057 2550711 : hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
1058 2550711 : hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
1059 :
1060 : /* log opheader */
1061 2550711 : hdr->oph[1].oh_tid = tid;
1062 2550711 : hdr->oph[1].oh_clientid = XFS_TRANSACTION;
1063 2550711 : hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
1064 :
1065 : /* transaction header in host byte order format */
1066 2550711 : hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
1067 2550711 : hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
1068 2550711 : hdr->thdr.th_tid = tic->t_tid;
1069 2550711 : hdr->thdr.th_num_items = num_iovecs;
1070 :
1071 : /* log iovec region pointer */
1072 2550711 : hdr->lhdr[1].i_addr = &hdr->oph[1];
1073 2550711 : hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
1074 : sizeof(struct xfs_trans_header);
1075 2550711 : hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
1076 :
1077 2550711 : lvhdr->lv_niovecs = 2;
1078 2550711 : lvhdr->lv_iovecp = &hdr->lhdr[0];
1079 2550711 : lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
1080 :
1081 2550711 : tic->t_curr_res -= lvhdr->lv_bytes;
1082 2550711 : }
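/*
 * The reservation consumed by this header, with assumed on-disk sizes
 * (xlog_op_header 12 bytes, xfs_trans_header 16 bytes): one opheader for
 * the start record plus one opheader wrapping the transaction header.
 */
#include <stdio.h>

int main(void)
{
	int op_hdr = 12;	/* assumed sizeof(struct xlog_op_header) */
	int trans_hdr = 16;	/* assumed sizeof(struct xfs_trans_header) */

	int lhdr0 = op_hdr;		/* start record region */
	int lhdr1 = op_hdr + trans_hdr;	/* transaction header region */

	printf("deducted from ticket: %d bytes\n", lhdr0 + lhdr1);	/* 40 */
	return 0;
}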
1083 :
1084 : /*
1085 : * CIL item reordering compare function. We want to order in ascending ID order,
1086 : * but we want to leave items with the same ID in the order they were added to
1087 : * the list. This is important for operations like reflink where we log 4 order
1088 : * dependent intents in a single transaction when we overwrite an existing
1089 : * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
1090 : * CUI (inc), BUI(remap)...
1091 : */
1092 : static int
1093 3060831178 : xlog_cil_order_cmp(
1094 : void *priv,
1095 : const struct list_head *a,
1096 : const struct list_head *b)
1097 : {
1098 3060831178 : struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list);
1099 3060831178 : struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list);
1100 :
1101 3060831178 : return l1->lv_order_id > l2->lv_order_id;
1102 : }
1103 :
1104 : /*
1105 : * Pull all the log vectors off the items in the CIL, and remove the items from
1106 : * the CIL. We don't need the CIL lock here because it's only needed on the
1107 : * transaction commit side which is currently locked out by the flush lock.
1108 : *
1109 : * If a log item is marked with a whiteout, we do not need to write it to the
1110 : * journal and so we just move them to the whiteout list for the caller to
1111 : * dispose of appropriately.
1112 : */
1113 : static void
1114 2550709 : xlog_cil_build_lv_chain(
1115 : struct xfs_cil_ctx *ctx,
1116 : struct list_head *whiteouts,
1117 : uint32_t *num_iovecs,
1118 : uint32_t *num_bytes)
1119 : {
1120 800587521 : while (!list_empty(&ctx->log_items)) {
1121 798036809 : struct xfs_log_item *item;
1122 798036809 : struct xfs_log_vec *lv;
1123 :
1124 798036809 : item = list_first_entry(&ctx->log_items,
1125 : struct xfs_log_item, li_cil);
1126 :
1127 1596073618 : if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
1128 382482025 : list_move(&item->li_cil, whiteouts);
1129 382482002 : trace_xfs_cil_whiteout_skip(item);
1130 382481985 : continue;
1131 : }
1132 :
1133 415554784 : lv = item->li_lv;
1134 415554784 : lv->lv_order_id = item->li_order_id;
1135 :
1136 : /* we don't write ordered log vectors */
1137 415554784 : if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
1138 415068776 : *num_bytes += lv->lv_bytes;
1139 415554784 : *num_iovecs += lv->lv_niovecs;
1140 415554784 : list_add_tail(&lv->lv_list, &ctx->lv_chain);
1141 :
1142 415555067 : list_del_init(&item->li_cil);
1143 415554827 : item->li_order_id = 0;
1144 415554827 : item->li_lv = NULL;
1145 : }
1146 2550712 : }
1147 :
1148 : static void
1149 2550711 : xlog_cil_cleanup_whiteouts(
1150 : struct list_head *whiteouts)
1151 : {
1152 385035428 : while (!list_empty(whiteouts)) {
1153 382484716 : struct xfs_log_item *item = list_first_entry(whiteouts,
1154 : struct xfs_log_item, li_cil);
1155 382484716 : list_del_init(&item->li_cil);
1156 382481516 : trace_xfs_cil_whiteout_unpin(item);
1157 382481548 : item->li_ops->iop_unpin(item, 1);
1158 : }
1159 2550712 : }
1160 :
1161 : /*
1162 : * Push the Committed Item List to the log.
1163 : *
1164 : * If the current sequence is the same as xc_push_seq we need to do a flush. If
1165 : * xc_push_seq is less than the current sequence, then it has already been
1166 : * flushed and we don't need to do anything - the caller will wait for it to
1167 : * complete if necessary.
1168 : *
1169 : * xc_push_seq is checked unlocked against the sequence number for a match.
1170 : * Hence we can allow log forces to run racily and not issue pushes for the
1171 : * same sequence twice. If we get a race between multiple pushes for the same
1172 : * sequence they will block on the first one and then abort, hence avoiding
1173 : * needless pushes.
1174 : */
1175 : static void
1176 2550712 : xlog_cil_push_work(
1177 : struct work_struct *work)
1178 : {
1179 2550712 : struct xfs_cil_ctx *ctx =
1180 2550712 : container_of(work, struct xfs_cil_ctx, push_work);
1181 2550712 : struct xfs_cil *cil = ctx->cil;
1182 2550712 : struct xlog *log = cil->xc_log;
1183 2550712 : struct xfs_cil_ctx *new_ctx;
1184 2550712 : int num_iovecs = 0;
1185 2550712 : int num_bytes = 0;
1186 2550712 : int error = 0;
1187 2550712 : struct xlog_cil_trans_hdr thdr;
1188 2550712 : struct xfs_log_vec lvhdr = {};
1189 2550712 : xfs_csn_t push_seq;
1190 2550712 : bool push_commit_stable;
1191 2550712 : LIST_HEAD (whiteouts);
1192 2550712 : struct xlog_ticket *ticket;
1193 :
1194 2550712 : new_ctx = xlog_cil_ctx_alloc();
1195 2550711 : new_ctx->ticket = xlog_cil_ticket_alloc(log);
1196 :
1197 2550710 : down_write(&cil->xc_ctx_lock);
1198 :
1199 2550712 : spin_lock(&cil->xc_push_lock);
1200 2550712 : push_seq = cil->xc_push_seq;
1201 2550712 : ASSERT(push_seq <= ctx->sequence);
1202 2550712 : push_commit_stable = cil->xc_push_commit_stable;
1203 2550712 : cil->xc_push_commit_stable = false;
1204 :
1205 : /*
1206 : * As we are about to switch to a new, empty CIL context, we no longer
1207 : * need to throttle tasks on CIL space overruns. Wake any waiters that
1208 : * the hard push throttle may have caught so they can start committing
1209 : * to the new context. The ctx->xc_push_lock provides the serialisation
1210 : * necessary for safely using the lockless waitqueue_active() check in
1211 : * this context.
1212 : */
1213 2550712 : if (waitqueue_active(&cil->xc_push_wait))
1214 0 : wake_up_all(&cil->xc_push_wait);
1215 :
1216 2550712 : xlog_cil_push_pcp_aggregate(cil, ctx);
1217 :
1218 : /*
1219 : * Check if we've anything to push. If there is nothing, then we don't
1220 : * move on to a new sequence number and so we have to be able to push
1221 : * this sequence again later.
1222 : */
1223 5101416 : if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
1224 0 : cil->xc_push_seq = 0;
1225 0 : spin_unlock(&cil->xc_push_lock);
1226 0 : goto out_skip;
1227 : }
1228 :
1229 :
1230 : /* check for a previously pushed sequence */
1231 2550708 : if (push_seq < ctx->sequence) {
1232 0 : spin_unlock(&cil->xc_push_lock);
1233 0 : goto out_skip;
1234 : }
1235 :
1236 : /*
1237 : * We are now going to push this context, so add it to the committing
1238 : * list before we do anything else. This ensures that anyone waiting on
1239 : * this push can easily detect the difference between a "push in
1240 : * progress" and "CIL is empty, nothing to do".
1241 : *
1242 : * IOWs, a wait loop can now check for:
1243 : * the current sequence not being found on the committing list;
1244 : * an empty CIL; and
1245 : * an unchanged sequence number
1246 : * to detect a push that had nothing to do and therefore does not need
1247 : * waiting on. If the CIL is not empty, we get put on the committing
1248 : * list before emptying the CIL and bumping the sequence number. Hence
1249 : * an empty CIL and an unchanged sequence number means we jumped out
1250 : * above after doing nothing.
1251 : *
1252 : * Hence the waiter will either find the commit sequence on the
1253 : * committing list or the sequence number will be unchanged and the CIL
1254 : * still dirty. In that latter case, the push has not yet started, and
1255 : * so the waiter will have to continue trying to check the CIL
1256 : * committing list until it is found. In extreme cases of delay, the
1257 : * sequence may fully commit between the waiter's attempts to check the
1258 : * committing list.
1259 : */
1260 2550708 : list_add(&ctx->committing, &cil->xc_committing);
1261 2550710 : spin_unlock(&cil->xc_push_lock);
1262 :
1263 2550710 : xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
1264 :
1265 : /*
1266 : * Switch the contexts so we can drop the context lock and move out
1267 : * of a shared context. We can't just go straight to the commit record,
1268 : * though - we need to synchronise with previous and future commits so
1269 : * that the commit records are correctly ordered in the log to ensure
1270 : * that we process items during log IO completion in the correct order.
1271 : *
1272 : * For example, if we get an EFI in one checkpoint and the EFD in the
1273 : * next (e.g. due to log forces), we do not want the checkpoint with
1274 : * the EFD to be committed before the checkpoint with the EFI. Hence
1275 : * we must strictly order the commit records of the checkpoints so
1276 : * that: a) the checkpoint callbacks are attached to the iclogs in the
1277 : * correct order; and b) the checkpoints are replayed in correct order
1278 : * in log recovery.
1279 : *
1280 : * Hence we need to add this context to the committing context list so
1281 : * that higher sequences will wait for us to write out a commit record
1282 : * before they do.
1283 : *
1284 : * xfs_log_force_seq requires us to mirror the new sequence into the cil
1285 : * structure atomically with the addition of this sequence to the
1286 : * committing list. This also ensures that we can do unlocked checks
1287 : * against the current sequence in log forces without risking
1288 : * dereferencing a freed context pointer.
1289 : */
1290 2550711 : spin_lock(&cil->xc_push_lock);
1291 2550712 : xlog_cil_ctx_switch(cil, new_ctx);
1292 2550706 : spin_unlock(&cil->xc_push_lock);
1293 2550709 : up_write(&cil->xc_ctx_lock);
1294 :
1295 : /*
1296 : * Sort the log vector chain before we add the transaction headers.
1297 : * This ensures we always have the transaction headers at the start
1298 : * of the chain.
1299 : */
1300 2550708 : list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
1301 :
1302 : /*
1303 : * Build a checkpoint transaction header and write it to the log to
1304 : * begin the transaction. We need to account for the space used by the
1305 : * transaction header here as it is not accounted for in xlog_write().
1306 : * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
1307 : * it gets written into the iclog first.
1308 : */
1309 2550709 : xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
1310 2550710 : num_bytes += lvhdr.lv_bytes;
1311 2550710 : list_add(&lvhdr.lv_list, &ctx->lv_chain);
1312 :
1313 : /*
1314 : * Take the lvhdr back off the lv_chain immediately after calling
1315 : * xlog_cil_write_chain() as it should not be passed to log IO
1316 : * completion.
1317 : */
1318 2550711 : error = xlog_cil_write_chain(ctx, num_bytes);
1319 2550709 : list_del(&lvhdr.lv_list);
1320 2550710 : if (error)
1321 6429 : goto out_abort_free_ticket;
1322 :
1323 2544281 : error = xlog_cil_write_commit_record(ctx);
1324 2544283 : if (error)
1325 1 : goto out_abort_free_ticket;
1326 :
1327 : /*
1328 : * Grab the ticket from the ctx so we can ungrant it after releasing the
1329 : * commit_iclog. The ctx may be freed by the time we return from
1330 : * releasing the commit_iclog (i.e. checkpoint has been completed and
1331 : * callback run) so we can't reference the ctx after the call to
1332 : * xlog_state_release_iclog().
1333 : */
1334 2544282 : ticket = ctx->ticket;
1335 :
1336 : /*
1337 : * If the checkpoint spans multiple iclogs, wait for all previous iclogs
1338 : * to complete before we submit the commit_iclog. We can't use state
1339 : * checks for this - ACTIVE can be either a past completed iclog or a
1340 : * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a
1341 : * past or future iclog awaiting IO or ordered IO completion to be run.
1342 : * In the latter case, if it's a future iclog and we wait on it, then we
1343 : * will hang because it won't get processed through to ic_force_wait
1344 : * wakeup until this commit_iclog is written to disk. Hence we use the
1345 : * iclog header lsn and compare it to the commit lsn to determine if we
1346 : * need to wait on iclogs or not.
1347 : */
1348 2544282 : spin_lock(&log->l_icloglock);
1349 2544282 : if (ctx->start_lsn != ctx->commit_lsn) {
1350 1146020 : xfs_lsn_t plsn;
1351 :
1352 1146020 : plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn);
1353 1146020 : if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) {
1354 : /*
1355 : * Waiting on ic_force_wait orders the completion of
1356 : * iclogs older than ic_prev. Hence we only need to wait
1357 : * on the most recent older iclog here.
1358 : */
1359 1133536 : xlog_wait_on_iclog(ctx->commit_iclog->ic_prev);
1360 1133536 : spin_lock(&log->l_icloglock);
1361 : }
1362 :
1363 : /*
1364 : * We need to issue a pre-flush so that the ordering for this
1365 : * checkpoint is correctly preserved down to stable storage.
1366 : */
1367 1146020 : ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
1368 : }
1369 :
1370 : /*
1371 : * The commit iclog must be written to stable storage to guarantee
1372 : * journal IO vs metadata writeback IO is correctly ordered on stable
1373 : * storage.
1374 : *
1375 : * If the push caller needs the commit to be immediately stable and the
1376 : * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it
1377 : * will be written when released, switch its state to WANT_SYNC right
1378 : * now.
1379 : */
1380 2544282 : ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
1381 2544282 : if (push_commit_stable &&
1382 44577 : ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
1383 40469 : xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
1384 2544282 : ticket = ctx->ticket;
1385 2544282 : xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
1386 :
1387 : /* Not safe to reference ctx now! */
1388 :
1389 2544282 : spin_unlock(&log->l_icloglock);
1390 2544282 : xlog_cil_cleanup_whiteouts(&whiteouts);
1391 2544282 : xfs_log_ticket_ungrant(log, ticket);
1392 5094992 : return;
1393 :
1394 0 : out_skip:
1395 0 : up_write(&cil->xc_ctx_lock);
1396 0 : xfs_log_ticket_put(new_ctx->ticket);
1397 0 : kmem_free(new_ctx);
1398 : return;
1399 :
1400 6430 : out_abort_free_ticket:
1401 12860 : ASSERT(xlog_is_shutdown(log));
1402 6430 : xlog_cil_cleanup_whiteouts(&whiteouts);
1403 6430 : if (!ctx->commit_iclog) {
1404 6429 : xfs_log_ticket_ungrant(log, ctx->ticket);
1405 6429 : xlog_cil_committed(ctx);
1406 6429 : return;
1407 : }
1408 1 : spin_lock(&log->l_icloglock);
1409 1 : ticket = ctx->ticket;
1410 1 : xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
1411 : /* Not safe to reference ctx now! */
1412 1 : spin_unlock(&log->l_icloglock);
1413 1 : xfs_log_ticket_ungrant(log, ticket);
1414 : }
1415 :
1416 : /*
1417 : * We need to push the CIL every so often so we don't cache more than we can fit in
1418 : * the log. The limit really is that a checkpoint can't be more than half the
1419 : * log (the current checkpoint is not allowed to overwrite the previous
1420 : * checkpoint), but commit latency and memory usage limit this to a smaller
1421 : * size.
1422 : */
1423 : static void
1424 901731069 : xlog_cil_push_background(
1425 : struct xlog *log) __releases(cil->xc_ctx_lock)
1426 : {
1427 901731069 : struct xfs_cil *cil = log->l_cilp;
1428 901731069 : int space_used = atomic_read(&cil->xc_ctx->space_used);
1429 :
1430 : /*
1431 : * The cil won't be empty because we are called while holding the
1432 : * context lock so whatever we added to the CIL will still be there.
1433 : */
1434 901731069 : ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
1435 :
1436 : /*
1437 : * We are done if:
1438 : * - we haven't used up all the space available yet; or
1439 : * - we've already queued up a push; and
1440 : * - we're not over the hard limit; and
1441 : * - nothing has been over the hard limit.
1442 : *
1443 : * If so, we don't need to take the push lock as there's nothing to do.
1444 : */
1445 901731075 : if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
1446 22427 : (cil->xc_push_seq == cil->xc_current_sequence &&
1447 21816 : space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
1448 : !waitqueue_active(&cil->xc_push_wait))) {
1449 901730458 : up_read(&cil->xc_ctx_lock);
1450 901730458 : return;
1451 : }
1452 :
1453 611 : spin_lock(&cil->xc_push_lock);
1454 611 : if (cil->xc_push_seq < cil->xc_current_sequence) {
1455 610 : cil->xc_push_seq = cil->xc_current_sequence;
1456 610 : queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
1457 : }
1458 :
1459 : /*
1460 : * Drop the context lock now; we can't hold it if we need to sleep
1461 : * because we are over the blocking threshold. The push_lock is still
1462 : * held, so blocking threshold sleep/wakeup is still correctly
1463 : * serialised here.
1464 : */
1465 611 : up_read(&cil->xc_ctx_lock);
1466 :
1467 : /*
1468 : * If we are well over the space limit, throttle the work that is being
1469 : * done until the push work on this context has begun. Enforce the hard
1470 : * throttle on all transaction commits once it has been activated, even
1471 : * if the committing transactions have resulted in the space usage
1472 : * dipping back down under the hard limit.
1473 : *
1474 : * The cil->xc_push_lock provides the serialisation necessary for safely
1475 : * calling xlog_cil_over_hard_limit() in this context.
1476 : */
1477 611 : if (xlog_cil_over_hard_limit(log, space_used)) {
1478 0 : trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
1479 0 : ASSERT(space_used < log->l_logsize);
1480 0 : xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
1481 0 : return;
1482 : }
1483 :
1484 611 : spin_unlock(&cil->xc_push_lock);
1485 :
1486 : }
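/*
 * For reference, the two thresholds compared above come from
 * xfs_log_priv.h. A hedged sketch of how they were defined in this era
 * (from memory; the exact expressions vary between kernel versions):
 */
#define XLOG_CIL_SPACE_LIMIT(log)	\
	min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)

#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)	\
	(XLOG_CIL_SPACE_LIMIT(log) * 2)

/*
 * So, roughly, background pushes start at 1/8th of the log size (capped
 * by a fixed upper bound) and the hard throttle engages at twice that -
 * e.g. ~12.5MB and ~25MB for a 100MB log, assuming the static cap does
 * not kick in first - both well under the "half the log" architectural
 * limit described above.
 */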
1487 :
1488 : /*
1489 : * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
1490 : * number that is passed. When it returns, the work will be queued for
1491 : * @push_seq, but it won't be completed.
1492 : *
1493 : * If the caller is performing a synchronous force, we will flush the workqueue
1494 : * to get previously queued work moving, minimising the time the caller
1495 : * spends waiting for all outstanding pushes to complete. The caller is
1496 : * expected to do the required waiting for push_seq to complete.
1497 : *
1498 : * If the caller is performing an async push, we need to ensure that the
1499 : * checkpoint is fully flushed out of the iclogs when we finish the push. If we
1500 : * don't do this, then the commit record may remain sitting in memory in an
1501 : * ACTIVE iclog. This then requires another full log force to push to disk,
1502 : * which defeats the purpose of having an async, non-blocking CIL force
1503 : * mechanism. Hence in this case we need to pass a flag to the push work to
1504 : * indicate it needs to flush the commit record itself.
1505 : */
1506 : static void
1507 5620155 : xlog_cil_push_now(
1508 : struct xlog *log,
1509 : xfs_lsn_t push_seq,
1510 : bool async)
1511 : {
1512 5620155 : struct xfs_cil *cil = log->l_cilp;
1513 :
1514 5620155 : if (!cil)
1515 : return;
1516 :
1517 5620155 : ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
1518 :
1519 : /* start on any pending background push to minimise wait time on it */
1520 5620155 : if (!async)
1521 5564555 : flush_workqueue(cil->xc_push_wq);
1522 :
1523 5620158 : spin_lock(&cil->xc_push_lock);
1524 :
1525 : /*
1526 : * If this is an async flush request, we always need to set the
1527 : * xc_push_commit_stable flag even if something else has already queued
1528 : * a push. The flush caller is asking for the CIL to be on stable
1529 : * storage when the next push completes, so regardless of who has queued
1530 : * the push, the flush requires stable semantics from it.
1531 : */
1532 5620493 : cil->xc_push_commit_stable = async;
1533 :
1534 : /*
1535 : * If the CIL is empty or we've already pushed the sequence then
1536 : * there's no more work that we need to do.
1537 : */
1538 5620493 : if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
1539 3179826 : push_seq <= cil->xc_push_seq) {
1540 3070392 : spin_unlock(&cil->xc_push_lock);
1541 3070392 : return;
1542 : }
1543 :
1544 2550101 : cil->xc_push_seq = push_seq;
1545 2550101 : queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
1546 2550102 : spin_unlock(&cil->xc_push_lock);
1547 : }
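/*
 * A minimal usage sketch of the two modes described above, mirroring
 * the call sites that appear later in this file (illustrative only):
 *
 *	sync force - queue the push; the caller then waits on the
 *	committing list itself:
 *		xlog_cil_push_now(log, sequence, false);
 *
 *	async force - queue the push and also require the commit record
 *	to reach stable storage without a second log force:
 *		xlog_cil_push_now(log, seq, true);
 */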
1548 :
1549 : bool
1550 253936 : xlog_cil_empty(
1551 : struct xlog *log)
1552 : {
1553 253936 : struct xfs_cil *cil = log->l_cilp;
1554 253936 : bool empty = false;
1555 :
1556 253936 : spin_lock(&cil->xc_push_lock);
1557 507858 : if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
1558 250972 : empty = true;
1559 253929 : spin_unlock(&cil->xc_push_lock);
1560 253930 : return empty;
1561 : }
1562 :
1563 : /*
1564 : * If there are intent done items in this transaction and the related intent was
1565 : * committed in the current (same) CIL checkpoint, we don't need to write either
1566 : * the intent or the intent done item to the journal as the change will be
1567 : * journalled atomically within this checkpoint. As we cannot remove items from
1568 : * the CIL here, mark the related intent with a whiteout so that the CIL push
1569 : * can remove it rather than writing it to the journal. Then remove the intent
1570 : * done item from the current transaction and release it so it doesn't get put
1571 : * into the CIL at all.
1572 : */
1573 : static uint32_t
1574 486923063 : xlog_cil_process_intents(
1575 : struct xfs_cil *cil,
1576 : struct xfs_trans *tp)
1577 : {
1578 486923063 : struct xfs_log_item *lip, *ilip, *next;
1579 486923063 : uint32_t len = 0;
1580 :
1581 3623142055 : list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
1582 3136221398 : if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
1583 2753308958 : continue;
1584 :
1585 382912440 : ilip = lip->li_ops->iop_intent(lip);
1586 382909864 : if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
1587 424151 : continue;
1588 382485793 : set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
1589 382485817 : trace_xfs_cil_whiteout_mark(ilip);
1590 382485619 : len += ilip->li_lv->lv_bytes;
1591 382485619 : kmem_free(ilip->li_lv);
1592 382485929 : ilip->li_lv = NULL;
1593 :
1594 382485929 : xfs_trans_del_item(lip);
1595 382485878 : lip->li_ops->iop_release(lip);
1596 : }
1597 486920657 : return len;
1598 : }
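/*
 * To make the accounting above concrete, here is a minimal user-space
 * model of the whiteout optimisation (hypothetical names, not kernel
 * code): a done item cancels its intent only when both live in the
 * same checkpoint, and the cancelled intent's formatted size is
 * returned so the caller can hand that space back.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_item {
	bool	is_done;	/* models XFS_ITEM_INTENT_DONE */
	int	intent;		/* index of the intent it cancels, or -1 */
	bool	whiteout;	/* models XFS_LI_WHITEOUT */
	int	lv_bytes;	/* models lip->li_lv->lv_bytes */
};

/* returns the journal bytes released by whiteouts, like "len" above */
static int model_process_intents(struct model_item *items, int nr,
				 bool same_checkpoint)
{
	int saved = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (!items[i].is_done || items[i].intent < 0)
			continue;
		if (!same_checkpoint)
			continue;	/* intent already journalled */
		items[items[i].intent].whiteout = true;
		saved += items[items[i].intent].lv_bytes;
	}
	return saved;
}

int main(void)
{
	struct model_item items[] = {
		{ .is_done = false, .intent = -1, .lv_bytes = 128 },
		{ .is_done = true,  .intent = 0,  .lv_bytes = 32 },
	};

	/* prints 128: the intent's bytes never reach the journal */
	printf("journal bytes saved: %d\n",
	       model_process_intents(items, 2, true));
	return 0;
}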
1599 :
1600 : /*
1601 : * Commit a transaction and its log items to the Committed Item List.
1602 : *
1603 : * To do this, we need to format the item, pin it in memory if required and
1604 : * account for the space used by the transaction. Once we have done that we
1605 : * need to release the unused reservation for the transaction, attach the
1606 : * transaction to the checkpoint context so we carry the busy extents through
1607 : * to checkpoint completion, and then unlock all the items in the transaction.
1608 : *
1609 : * Called with the context lock already held in read mode to lock out
1610 : * background commit, returns without it held once background commits are
1611 : * allowed again.
1612 : */
1613 : void
1614 901752341 : xlog_cil_commit(
1615 : struct xlog *log,
1616 : struct xfs_trans *tp,
1617 : xfs_csn_t *commit_seq,
1618 : bool regrant)
1619 : {
1620 901752341 : struct xfs_cil *cil = log->l_cilp;
1621 901752341 : struct xfs_log_item *lip, *next;
1622 901752341 : uint32_t released_space = 0;
1623 :
1624 : /*
1625 : * Do all necessary memory allocation before we lock the CIL.
1626 : * This ensures the allocation does not deadlock with a CIL
1627 : * push in memory reclaim (e.g. from kswapd).
1628 : */
1629 901752341 : xlog_cil_alloc_shadow_bufs(log, tp);
1630 :
1631 : /* lock out background commit */
1632 901737629 : down_read(&cil->xc_ctx_lock);
1633 :
1634 901796772 : if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
1635 486922117 : released_space = xlog_cil_process_intents(cil, tp);
1636 :
1637 901794498 : xlog_cil_insert_items(log, tp, released_space);
1638 :
1639 1356031250 : if (regrant && !xlog_is_shutdown(log))
1640 454214813 : xfs_log_ticket_regrant(log, tp->t_ticket);
1641 : else
1642 447601629 : xfs_log_ticket_ungrant(log, tp->t_ticket);
1643 901828369 : tp->t_ticket = NULL;
1644 901828369 : xfs_trans_unreserve_and_mod_sb(tp);
1645 :
1646 : /*
1647 : * Once all the items of the transaction have been copied to the CIL,
1648 : * the items can be unlocked and possibly freed.
1649 : *
1650 : * This needs to be done before we drop the CIL context lock because we
1651 : * have to update state in the log items and unlock them before they go
1652 : * to disk. If we don't, then the CIL checkpoint can race with us and
1653 : * we can run checkpoint completion before we've updated and unlocked
1654 : * the log items. This affects (at least) processing of stale buffers,
1655 : * inodes and EFIs.
1656 : */
1657 901833445 : trace_xfs_trans_commit_items(tp, _RET_IP_);
1658 6012469489 : list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
1659 5110701619 : xfs_trans_del_item(lip);
1660 5110834890 : if (lip->li_ops->iop_committing)
1661 4726890230 : lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
1662 : }
1663 901767870 : if (commit_seq)
1664 901767870 : *commit_seq = cil->xc_ctx->sequence;
1665 :
1666 : /* xlog_cil_push_background() releases cil->xc_ctx_lock */
1667 901767870 : xlog_cil_push_background(log);
1668 901795872 : }
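/*
 * For context, a hedged sketch of the caller side - __xfs_trans_commit()
 * in xfs_trans.c looked roughly like this in this era (from memory,
 * heavily simplified, not verbatim):
 *
 *	xfs_csn_t	commit_seq = 0;
 *
 *	...
 *	xlog_cil_commit(log, tp, &commit_seq, regrant);
 *	xfs_trans_free(tp);	(tp->t_ticket is now NULL and every
 *				 item has already been unlocked)
 */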
1669 :
1670 : /*
1671 : * Flush the CIL to stable storage but don't wait for it to complete. This
1672 : * requires the CIL push to ensure the commit record for the push hits the disk,
1673 : * but otherwise is no different to a push done from a log force.
1674 : */
1675 : void
1676 55623 : xlog_cil_flush(
1677 : struct xlog *log)
1678 : {
1679 55623 : xfs_csn_t seq = log->l_cilp->xc_current_sequence;
1680 :
1681 55623 : trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
1682 55623 : xlog_cil_push_now(log, seq, true);
1683 :
1684 : /*
1685 : * If the CIL is empty, make sure that any previous checkpoint that may
1686 : * still be in an active iclog is pushed to stable storage.
1687 : */
1688 111246 : if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
1689 8185 : xfs_log_force(log->l_mp, 0);
1690 55623 : }
1691 :
1692 : /*
1693 : * Conditionally push the CIL based on the sequence passed in.
1694 : *
1695 : * We only need to push if we haven't already pushed the sequence number given.
1696 : * Hence the only time we will trigger a push here is if the push sequence is
1697 : * the same as the current context.
1698 : *
1699 : * We return the current commit lsn to allow the callers to determine if an
1700 : * iclog flush is necessary following this call.
1701 : */
1702 : xfs_lsn_t
1703 3888818 : xlog_cil_force_seq(
1704 : struct xlog *log,
1705 : xfs_csn_t sequence)
1706 : {
1707 3888818 : struct xfs_cil *cil = log->l_cilp;
1708 3888818 : struct xfs_cil_ctx *ctx;
1709 3888818 : xfs_lsn_t commit_lsn = NULLCOMMITLSN;
1710 :
1711 3888818 : ASSERT(sequence <= cil->xc_current_sequence);
1712 :
1713 3888818 : if (!sequence)
1714 0 : sequence = cil->xc_current_sequence;
1715 3888818 : trace_xfs_log_force(log->l_mp, sequence, _RET_IP_);
1716 :
1717 : /*
1718 : * check to see if we need to force out the current context.
1719 : * xlog_cil_push() handles racing pushes for the same sequence,
1720 : * so no need to deal with it here.
1721 : */
1722 : restart:
1723 5564604 : xlog_cil_push_now(log, sequence, false);
1724 :
1725 : /*
1726 : * See if we can find a previous sequence still committing.
1727 : * We need to wait for all previous sequence commits to complete
1728 : * before allowing the force of push_seq to go ahead. Hence block
1729 : * on commits for those as well.
1730 : */
1731 5564724 : spin_lock(&cil->xc_push_lock);
1732 8927651 : list_for_each_entry(ctx, &cil->xc_committing, committing) {
1733 : /*
1734 : * Avoid getting stuck in this loop because we were woken by the
1735 : * shutdown, but then went back to sleep once already in the
1736 : * shutdown state.
1737 : */
1738 7437688 : if (xlog_is_shutdown(log))
1739 322 : goto out_shutdown;
1740 3718522 : if (ctx->sequence > sequence)
1741 194807 : continue;
1742 3523715 : if (!ctx->commit_lsn) {
1743 : /*
1744 : * It is still being pushed! Wait for the push to
1745 : * complete, then start again from the beginning.
1746 : */
1747 355744 : XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
1748 355744 : xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
1749 355733 : goto restart;
1750 : }
1751 3167971 : if (ctx->sequence != sequence)
1752 372392 : continue;
1753 : /* found it! */
1754 : commit_lsn = ctx->commit_lsn;
1755 : }
1756 :
1757 : /*
1758 : * The call to xlog_cil_push_now() executes the push in the background.
1759 : * Hence by the time we have got here, our sequence may not have been
1760 : * pushed yet. This is true if the current sequence still matches the
1761 : * push sequence after the above wait loop and the CIL still contains
1762 : * dirty objects. This is guaranteed by the push code first adding the
1763 : * context to the committing list before emptying the CIL.
1764 : *
1765 : * Hence if we don't find the context in the committing list and the
1766 : * current sequence number is unchanged then the CIL contents are
1767 : * significant. If the CIL is empty, it means there was nothing to push
1768 : * and that means there is nothing to wait for. If the CIL is not empty,
1769 : * it means we haven't yet started the push, because if it had started
1770 : * we would have found the context on the committing list.
1771 : */
1772 7585513 : if (sequence == cil->xc_current_sequence &&
1773 2376706 : !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
1774 1319954 : spin_unlock(&cil->xc_push_lock);
1775 1319954 : goto restart;
1776 : }
1777 :
1778 3888853 : spin_unlock(&cil->xc_push_lock);
1779 3888853 : return commit_lsn;
1780 :
1781 : /*
1782 : * to pass through its iclog state machine error handling, even though
1783 : * to pass through it's iclog state machine error handling, even though
1784 : * we are already in a shutdown state. Hence we can't return
1785 : * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
1786 : * LSN is already stable), so we return a zero LSN instead.
1787 : */
1788 : out_shutdown:
1789 322 : spin_unlock(&cil->xc_push_lock);
1790 322 : return 0;
1791 : }
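/*
 * A hedged sketch of how the three return cases above are consumed -
 * modelled on xfs_log_force_seq() from memory, simplified:
 *
 *	lsn = xlog_cil_force_seq(log, seq);
 *	if (lsn == NULLCOMMITLSN)
 *		return 0;	(sequence already stable, nothing to do)
 *
 *	A zero LSN (shutdown) and a real commit LSN both fall through
 *	to the iclog force so its error handling still runs:
 *
 *	ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
 */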
1792 :
1793 : /*
1794 : * Move dead percpu state to the relevant CIL context structures.
1795 : *
1796 : * We have to lock the CIL context here to ensure that nothing is modifying
1797 : * the percpu state, either addition or removal. Both of these are done under
1798 : * the CIL context lock, so grabbing that exclusively here will ensure we can
1799 : * safely drain the cilpcp for the CPU that is dying.
1800 : */
1801 : void
1802 15 : xlog_cil_pcp_dead(
1803 : struct xlog *log,
1804 : unsigned int cpu)
1805 : {
1806 15 : struct xfs_cil *cil = log->l_cilp;
1807 15 : struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
1808 15 : struct xfs_cil_ctx *ctx;
1809 :
1810 15 : down_write(&cil->xc_ctx_lock);
1811 15 : ctx = cil->xc_ctx;
1812 15 : if (ctx->ticket)
1813 15 : ctx->ticket->t_curr_res += cilpcp->space_reserved;
1814 15 : cilpcp->space_reserved = 0;
1815 :
1816 15 : if (!list_empty(&cilpcp->log_items))
1817 9 : list_splice_init(&cilpcp->log_items, &ctx->log_items);
1818 15 : if (!list_empty(&cilpcp->busy_extents))
1819 4 : list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
1820 15 : atomic_add(cilpcp->space_used, &ctx->space_used);
1821 15 : cilpcp->space_used = 0;
1822 15 : up_write(&cil->xc_ctx_lock);
1823 15 : }
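/*
 * For context: this is reached from the CPU hotplug "dead" callback. A
 * hedged sketch of the hookup (from memory of xfs_super.c in this era;
 * details may differ):
 *
 *	static int xfs_cpu_dead(unsigned int cpu)
 *	{
 *		(for each mounted XFS filesystem mp)
 *			xlog_cil_pcp_dead(mp->m_log, cpu);
 *		return 0;
 *	}
 *
 *	cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead",
 *				  NULL, xfs_cpu_dead);
 */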
1824 :
1825 : /*
1826 : * Perform initial CIL structure initialisation.
1827 : */
1828 : int
1829 22491 : xlog_cil_init(
1830 : struct xlog *log)
1831 : {
1832 22491 : struct xfs_cil *cil;
1833 22491 : struct xfs_cil_ctx *ctx;
1834 22491 : struct xlog_cil_pcp *cilpcp;
1835 22491 : int cpu;
1836 :
1837 22491 : cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
1838 22491 : if (!cil)
1839 : return -ENOMEM;
1840 : /*
1841 : * Limit the CIL pipeline depth to 4 concurrent works to bound the
1842 : * concurrency the log spinlocks will be exposed to.
1843 : */
1844 44982 : cil->xc_push_wq = alloc_workqueue("xfs-cil/%s",
1845 : XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
1846 22491 : 4, log->l_mp->m_super->s_id);
1847 22491 : if (!cil->xc_push_wq)
1848 0 : goto out_destroy_cil;
1849 :
1850 22491 : cil->xc_log = log;
1851 22491 : cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
1852 22491 : if (!cil->xc_pcp)
1853 0 : goto out_destroy_wq;
1854 :
1855 67473 : for_each_possible_cpu(cpu) {
1856 44982 : cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
1857 44982 : INIT_LIST_HEAD(&cilpcp->busy_extents);
1858 44982 : INIT_LIST_HEAD(&cilpcp->log_items);
1859 : }
1860 :
1861 22491 : INIT_LIST_HEAD(&cil->xc_committing);
1862 22491 : spin_lock_init(&cil->xc_push_lock);
1863 22491 : init_waitqueue_head(&cil->xc_push_wait);
1864 22491 : init_rwsem(&cil->xc_ctx_lock);
1865 22491 : init_waitqueue_head(&cil->xc_start_wait);
1866 22491 : init_waitqueue_head(&cil->xc_commit_wait);
1867 22491 : log->l_cilp = cil;
1868 :
1869 22491 : ctx = xlog_cil_ctx_alloc();
1870 22491 : xlog_cil_ctx_switch(cil, ctx);
1871 22491 : return 0;
1872 :
1873 : out_destroy_wq:
1874 0 : destroy_workqueue(cil->xc_push_wq);
1875 0 : out_destroy_cil:
1876 0 : kmem_free(cil);
1877 0 : return -ENOMEM;
1878 : }
1879 :
1880 : void
1881 22494 : xlog_cil_destroy(
1882 : struct xlog *log)
1883 : {
1884 22494 : struct xfs_cil *cil = log->l_cilp;
1885 :
1886 22494 : if (cil->xc_ctx) {
1887 22494 : if (cil->xc_ctx->ticket)
1888 22486 : xfs_log_ticket_put(cil->xc_ctx->ticket);
1889 22494 : kmem_free(cil->xc_ctx);
1890 : }
1891 :
1892 22494 : ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
1893 22494 : free_percpu(cil->xc_pcp);
1894 22494 : destroy_workqueue(cil->xc_push_wq);
1895 22494 : kmem_free(cil);
1896 22494 : }
1897 :
|