Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_bit.h"
13 : #include "xfs_sb.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_log.h"
19 : #include "xfs_log_priv.h"
20 : #include "xfs_log_recover.h"
21 : #include "xfs_trans_priv.h"
22 : #include "xfs_alloc.h"
23 : #include "xfs_ialloc.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_icache.h"
26 : #include "xfs_error.h"
27 : #include "xfs_buf_item.h"
28 : #include "xfs_ag.h"
29 : #include "xfs_quota.h"
30 : #include "xfs_reflink.h"
31 :
/*
 * Midpoint of two log block numbers, rounded down. Both arguments and the
 * whole expansion are parenthesised so that argument expressions containing
 * operators of lower precedence than '+' (shifts, bitwise ops, ?:) expand as
 * the caller wrote them.
 */
#define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
33 :
34 : STATIC int
35 : xlog_find_zeroed(
36 : struct xlog *,
37 : xfs_daddr_t *);
38 : STATIC int
39 : xlog_clear_stale_blocks(
40 : struct xlog *,
41 : xfs_lsn_t);
42 : STATIC int
43 : xlog_do_recovery_pass(
44 : struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
45 :
46 : /*
47 : * Sector aligned buffer routines for buffer create/read/write/access
48 : */
49 :
50 : /*
51 : * Verify the log-relative block number and length in basic blocks are valid for
52 : * an operation involving the given XFS log buffer. Returns true if the fields
53 : * are valid, false otherwise.
54 : */
55 : static inline bool
56 : xlog_verify_bno(
57 : struct xlog *log,
58 : xfs_daddr_t blk_no,
59 : int bbcount)
60 : {
61 12963303 : if (blk_no < 0 || blk_no >= log->l_logBBsize)
62 : return false;
63 13213844 : if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
64 0 : return false;
65 : return true;
66 : }
67 :
68 : /*
69 : * Allocate a buffer to hold log data. The buffer needs to be able to map to
70 : * a range of nbblks basic blocks at any valid offset within the log.
71 : */
72 : static char *
73 250541 : xlog_alloc_buffer(
74 : struct xlog *log,
75 : int nbblks)
76 : {
77 : /*
78 : * Pass log block 0 since we don't have an addr yet, buffer will be
79 : * verified on read.
80 : */
81 501082 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
82 0 : xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
83 : nbblks);
84 0 : return NULL;
85 : }
86 :
87 : /*
88 : * We do log I/O in units of log sectors (a power-of-2 multiple of the
89 : * basic block size), so we round up the requested size to accommodate
90 : * the basic blocks required for complete log sectors.
91 : *
92 : * In addition, the buffer may be used for a non-sector-aligned block
93 : * offset, in which case an I/O of the requested size could extend
94 : * beyond the end of the buffer. If the requested size is only 1 basic
95 : * block it will never straddle a sector boundary, so this won't be an
96 : * issue. Nor will this be a problem if the log I/O is done in basic
97 : * blocks (sector size 1). But otherwise we extend the buffer by one
98 : * extra log sector to ensure there's space to accommodate this
99 : * possibility.
100 : */
101 250541 : if (nbblks > 1 && log->l_sectBBsize > 1)
102 117803 : nbblks += log->l_sectBBsize;
103 250541 : nbblks = round_up(nbblks, log->l_sectBBsize);
104 250541 : return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
105 : }
106 :
107 : /*
108 : * Return the address of the start of the given block number's data
109 : * in a log buffer. The buffer covers a log sector-aligned region.
110 : */
111 : static inline unsigned int
112 : xlog_align(
113 : struct xlog *log,
114 : xfs_daddr_t blk_no)
115 : {
116 12961924 : return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
117 : }
118 :
119 : static int
120 12963303 : xlog_do_io(
121 : struct xlog *log,
122 : xfs_daddr_t blk_no,
123 : unsigned int nbblks,
124 : char *data,
125 : enum req_op op)
126 : {
127 12963303 : int error;
128 :
129 25926606 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
130 0 : xfs_warn(log->l_mp,
131 : "Invalid log block/length (0x%llx, 0x%x) for buffer",
132 : blk_no, nbblks);
133 0 : return -EFSCORRUPTED;
134 : }
135 :
136 12963303 : blk_no = round_down(blk_no, log->l_sectBBsize);
137 12963303 : nbblks = round_up(nbblks, log->l_sectBBsize);
138 12963303 : ASSERT(nbblks > 0);
139 :
140 12963303 : error = xfs_rw_bdev(xfs_buftarg_bdev(log->l_targ),
141 12963303 : log->l_logBBstart + blk_no,
142 : BBTOB(nbblks), data, op);
143 12963303 : if (error && !xlog_is_shutdown(log)) {
144 0 : xfs_alert(log->l_mp,
145 : "log recovery %s I/O error at daddr 0x%llx len %d error %d",
146 : op == REQ_OP_WRITE ? "write" : "read",
147 : blk_no, nbblks, error);
148 : }
149 : return error;
150 : }
151 :
/*
 * Read nbblks basic blocks starting at log block blk_no into data.
 *
 * Unlike xlog_bread(), no pointer to the requested block inside the buffer is
 * handed back; the caller has to account for the sector rounding done by
 * xlog_do_io() itself. Returns 0 or the error from xlog_do_io().
 */
STATIC int
xlog_bread_noalign(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
}
161 :
162 : STATIC int
163 12858620 : xlog_bread(
164 : struct xlog *log,
165 : xfs_daddr_t blk_no,
166 : int nbblks,
167 : char *data,
168 : char **offset)
169 : {
170 12858620 : int error;
171 :
172 12858620 : error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
173 12858620 : if (!error)
174 12858620 : *offset = data + xlog_align(log, blk_no);
175 12858620 : return error;
176 : }
177 :
/*
 * Write nbblks basic blocks from data to the log starting at block blk_no.
 * Returns 0 or the error from xlog_do_io().
 */
STATIC int
xlog_bwrite(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
}
187 :
#ifdef DEBUG
/*
 * Debug-only helper: print the superblock uuid and the expected log format
 * next to the uuid and format stored in the given log record header.
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	xfs_debug(mp, " log : uuid = %pU, fmt = %d",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
/* Without DEBUG the dump expands to nothing. */
#define xlog_header_check_dump(mp, head)
#endif
205 :
206 : /*
207 : * check log record header for recovery
208 : */
209 : STATIC int
210 2609736 : xlog_header_check_recover(
211 : xfs_mount_t *mp,
212 : xlog_rec_header_t *head)
213 : {
214 2609736 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
215 :
216 : /*
217 : * IRIX doesn't write the h_fmt field and leaves it zeroed
218 : * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
219 : * a dirty log created in IRIX.
220 : */
221 2609736 : if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
222 0 : xfs_warn(mp,
223 : "dirty log written in incompatible format - can't recover");
224 0 : xlog_header_check_dump(mp, head);
225 0 : return -EFSCORRUPTED;
226 : }
227 2609736 : if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
228 : &head->h_fs_uuid))) {
229 0 : xfs_warn(mp,
230 : "dirty log entry has mismatched uuid - can't recover");
231 0 : xlog_header_check_dump(mp, head);
232 0 : return -EFSCORRUPTED;
233 : }
234 : return 0;
235 : }
236 :
237 : /*
238 : * read the head block of the log and check the header
239 : */
240 : STATIC int
241 24101 : xlog_header_check_mount(
242 : xfs_mount_t *mp,
243 : xlog_rec_header_t *head)
244 : {
245 24101 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
246 :
247 24101 : if (uuid_is_null(&head->h_fs_uuid)) {
248 : /*
249 : * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
250 : * h_fs_uuid is null, we assume this log was last mounted
251 : * by IRIX and continue.
252 : */
253 0 : xfs_warn(mp, "null uuid in log - IRIX style log");
254 24101 : } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
255 : &head->h_fs_uuid))) {
256 0 : xfs_warn(mp, "log has mismatched uuid - can't recover");
257 0 : xlog_header_check_dump(mp, head);
258 0 : return -EFSCORRUPTED;
259 : }
260 : return 0;
261 : }
262 :
263 : /*
264 : * This routine finds (to an approximation) the first block in the physical
265 : * log which contains the given cycle. It uses a binary search algorithm.
266 : * Note that the algorithm can not be perfect because the disk will not
267 : * necessarily be perfect.
268 : */
269 : STATIC int
270 24101 : xlog_find_cycle_start(
271 : struct xlog *log,
272 : char *buffer,
273 : xfs_daddr_t first_blk,
274 : xfs_daddr_t *last_blk,
275 : uint cycle)
276 : {
277 24101 : char *offset;
278 24101 : xfs_daddr_t mid_blk;
279 24101 : xfs_daddr_t end_blk;
280 24101 : uint mid_cycle;
281 24101 : int error;
282 :
283 24101 : end_blk = *last_blk;
284 24101 : mid_blk = BLK_AVG(first_blk, end_blk);
285 431757 : while (mid_blk != first_blk && mid_blk != end_blk) {
286 407656 : error = xlog_bread(log, mid_blk, 1, buffer, &offset);
287 407656 : if (error)
288 0 : return error;
289 407656 : mid_cycle = xlog_get_cycle(offset);
290 407656 : if (mid_cycle == cycle)
291 : end_blk = mid_blk; /* last_half_cycle == mid_cycle */
292 : else
293 131271 : first_blk = mid_blk; /* first_half_cycle == mid_cycle */
294 407656 : mid_blk = BLK_AVG(first_blk, end_blk);
295 : }
296 24101 : ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
297 : (mid_blk == end_blk && mid_blk-1 == first_blk));
298 :
299 24101 : *last_blk = end_blk;
300 :
301 24101 : return 0;
302 : }
303 :
304 : /*
305 : * Check that a range of blocks does not contain stop_on_cycle_no.
306 : * Fill in *new_blk with the block offset where such a block is
307 : * found, or with -1 (an invalid block number) if there is no such
308 : * block in the range. The scan needs to occur from front to back
309 : * and the pointer into the region must be updated since a later
310 : * routine will need to perform another test.
311 : */
312 : STATIC int
313 24743 : xlog_find_verify_cycle(
314 : struct xlog *log,
315 : xfs_daddr_t start_blk,
316 : int nbblks,
317 : uint stop_on_cycle_no,
318 : xfs_daddr_t *new_blk)
319 : {
320 24743 : xfs_daddr_t i, j;
321 24743 : uint cycle;
322 24743 : char *buffer;
323 24743 : xfs_daddr_t bufblks;
324 24743 : char *buf = NULL;
325 24743 : int error = 0;
326 :
327 : /*
328 : * Greedily allocate a buffer big enough to handle the full
329 : * range of basic blocks we'll be examining. If that fails,
330 : * try a smaller size. We need to be able to read at least
331 : * a log sector, or we're out of luck.
332 : */
333 24743 : bufblks = 1 << ffs(nbblks);
334 24743 : while (bufblks > log->l_logBBsize)
335 0 : bufblks >>= 1;
336 24743 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
337 0 : bufblks >>= 1;
338 0 : if (bufblks < log->l_sectBBsize)
339 : return -ENOMEM;
340 : }
341 :
342 213957 : for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
343 189233 : int bcount;
344 :
345 189233 : bcount = min(bufblks, (start_blk + nbblks - i));
346 :
347 189233 : error = xlog_bread(log, i, bcount, buffer, &buf);
348 189233 : if (error)
349 0 : goto out;
350 :
351 71341082 : for (j = 0; j < bcount; j++) {
352 71151868 : cycle = xlog_get_cycle(buf);
353 71151868 : if (cycle == stop_on_cycle_no) {
354 19 : *new_blk = i+j;
355 19 : goto out;
356 : }
357 :
358 71151849 : buf += BBSIZE;
359 : }
360 : }
361 :
362 24724 : *new_blk = -1;
363 :
364 24743 : out:
365 24743 : kmem_free(buffer);
366 24743 : return error;
367 : }
368 :
369 : static inline int
370 93349 : xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
371 : {
372 93349 : if (xfs_has_logv2(log->l_mp)) {
373 93337 : int h_size = be32_to_cpu(rh->h_size);
374 :
375 93337 : if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
376 : h_size > XLOG_HEADER_CYCLE_SIZE)
377 444 : return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
378 : }
379 : return 1;
380 : }
381 :
382 : /*
383 : * Potentially backup over partial log record write.
384 : *
385 : * In the typical case, last_blk is the number of the block directly after
386 : * a good log record. Therefore, we subtract one to get the block number
387 : * of the last block in the given buffer. extra_bblks contains the number
388 : * of blocks we would have read on a previous read. This happens when the
389 : * last log record is split over the end of the physical log.
390 : *
391 : * extra_bblks is the number of blocks potentially verified on a previous
392 : * call to this routine.
393 : */
394 : STATIC int
395 24110 : xlog_find_verify_log_record(
396 : struct xlog *log,
397 : xfs_daddr_t start_blk,
398 : xfs_daddr_t *last_blk,
399 : int extra_bblks)
400 : {
401 24110 : xfs_daddr_t i;
402 24110 : char *buffer;
403 24110 : char *offset = NULL;
404 24110 : xlog_rec_header_t *head = NULL;
405 24110 : int error = 0;
406 24110 : int smallmem = 0;
407 24110 : int num_blks = *last_blk - start_blk;
408 24110 : int xhdrs;
409 :
410 24110 : ASSERT(start_blk != 0 || *last_blk != start_blk);
411 :
412 24110 : buffer = xlog_alloc_buffer(log, num_blks);
413 24110 : if (!buffer) {
414 0 : buffer = xlog_alloc_buffer(log, 1);
415 0 : if (!buffer)
416 : return -ENOMEM;
417 : smallmem = 1;
418 : } else {
419 24110 : error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
420 24110 : if (error)
421 0 : goto out;
422 24110 : offset += ((num_blks - 1) << BBSHIFT);
423 : }
424 :
425 538783 : for (i = (*last_blk) - 1; i >= 0; i--) {
426 538778 : if (i < start_blk) {
427 : /* valid log record not found */
428 4 : xfs_warn(log->l_mp,
429 : "Log inconsistent (didn't find previous header)");
430 4 : ASSERT(0);
431 4 : error = -EFSCORRUPTED;
432 4 : goto out;
433 : }
434 :
435 538774 : if (smallmem) {
436 0 : error = xlog_bread(log, i, 1, buffer, &offset);
437 0 : if (error)
438 0 : goto out;
439 : }
440 :
441 538774 : head = (xlog_rec_header_t *)offset;
442 :
443 538774 : if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
444 : break;
445 :
446 514673 : if (!smallmem)
447 514673 : offset -= BBSIZE;
448 : }
449 :
450 : /*
451 : * We hit the beginning of the physical log & still no header. Return
452 : * to caller. If caller can handle a return of -1, then this routine
453 : * will be called again for the end of the physical log.
454 : */
455 24106 : if (i == -1) {
456 5 : error = 1;
457 5 : goto out;
458 : }
459 :
460 : /*
461 : * We have the final block of the good log (the first block
462 : * of the log record _before_ the head. So we check the uuid.
463 : */
464 24101 : if ((error = xlog_header_check_mount(log->l_mp, head)))
465 0 : goto out;
466 :
467 : /*
468 : * We may have found a log record header before we expected one.
469 : * last_blk will be the 1st block # with a given cycle #. We may end
470 : * up reading an entire log record. In this case, we don't want to
471 : * reset last_blk. Only when last_blk points in the middle of a log
472 : * record do we update last_blk.
473 : */
474 24101 : xhdrs = xlog_logrec_hblks(log, head);
475 :
476 48202 : if (*last_blk - i + extra_bblks !=
477 24101 : BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
478 37 : *last_blk = i;
479 :
480 24064 : out:
481 24110 : kmem_free(buffer);
482 24110 : return error;
483 : }
484 :
/*
 * Find the head of the log and return its block number in *return_head_blk.
 *
 * Head is defined to be the point of the log where the next log write
 * could go. This means that incomplete LR writes at the end are
 * eliminated when calculating the head. We aren't guaranteed that previous
 * LR have complete transactions. We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * The search runs in three steps: check for a (partially) zeroed log, locate
 * the cycle number change (binary search plus a linear verification scan),
 * then make sure the result does not point into the middle of a log record.
 *
 * Return: zero if normal, non-zero if error. Errors are -ENOMEM if the
 * one-block buffer cannot be allocated, -EIO if no record header is found
 * before the candidate head, or whatever the helpers return.
 */
STATIC int
xlog_find_head(
	struct xlog	*log,
	xfs_daddr_t	*return_head_blk)
{
	char		*buffer;
	char		*offset;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	error = xlog_find_zeroed(log, &first_blk);
	if (error < 0) {
		xfs_warn(log->l_mp, "empty log check failed");
		return error;
	}
	if (error == 1) {
		/* xlog_find_zeroed() already located the head for us */
		*return_head_blk = first_blk;

		/* Is the whole log zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xfs_warn(log->l_mp, "totally zeroed log");
		}

		return 0;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	buffer = xlog_alloc_buffer(log, 1);
	if (!buffer)
		return -ENOMEM;

	error = xlog_bread(log, 0, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number. In this
	 * case, head_blk can't be set to zero (which makes sense). The below
	 * math doesn't work out properly with head_blk equal to zero. Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct. If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum. In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle. We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1. If we find such a hole,
		 * then the start of that hole will be the new head. The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle. We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs. First we do a binary search
		 * for the first occurrence of last_half_cycle. The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us. If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log. The cases we're looking for look
		 * like
		 *                               v binary search stopped here
		 *        x + 1 ... | x | x + 1 | x ... | x
		 *                   ^ but we want to locate this spot
		 * or
		 *        <---------> less than scan distance
		 *        x + 1 ... | x ... | x - 1 | x
		 *                           ^ we want to locate this spot
		 */
		stop_on_cycle = last_half_cycle;
		error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
				last_half_cycle);
		if (error)
			goto out_free_buffer;
	}

	/*
	 * Now validate the answer. Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number. The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log. The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
	if (head_blk >= num_scan_bblks) {
		/*
		 * The scan window does not wrap around the start of the log,
		 * so the entire check can be performed in one call.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log,
						start_blk, num_scan_bblks,
						stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {		/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.
		 * First we scan the physical end of the log. In this part
		 * of the log, we are looking for blocks with cycle number
		 * last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as
		 * we've found a hole that didn't get written in going around
		 * the end of the physical log. The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of
		 * the log looking for occurrences of last_half_cycle. If we
		 * find one, then our current estimate for the location of the
		 * first occurrence of last_half_cycle is wrong and we move
		 * back to the hole we've found. This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                               ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k
		 * logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the
		 * x + 1 blocks. We need to skip past those since that is
		 * certainly not the head of the log. By searching for
		 * last_half_cycle-1 we accomplish that.
		 */
		ASSERT(head_blk <= INT_MAX &&
			(xfs_daddr_t) num_scan_bblks >= head_blk);
		start_blk = log_bbnum - (num_scan_bblks - head_blk);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto validate_head;
		}

		/*
		 * Scan beginning of log now. The last part of the physical
		 * log is good. This scan needs to verify that it doesn't find
		 * the last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log,
					start_blk, (int)head_blk,
					stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	}

validate_head:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = XLOG_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		/* start_blk > 0 here, so "hit block 0" means no header: fail */
		if (error == 1)
			error = -EIO;
		if (error)
			goto out_free_buffer;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		if (error < 0)
			goto out_free_buffer;
		if (error == 1) {
			/*
			 * We hit the beginning of the log during our search.
			 * Continue at the physical end of the log, passing
			 * the head_blk blocks already covered as extra_bblks.
			 */
			start_blk = log_bbnum - (num_scan_bblks - head_blk);
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
				(xfs_daddr_t) log_bbnum-start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			error = xlog_find_verify_log_record(log, start_blk,
							&new_blk, (int)head_blk);
			if (error == 1)
				error = -EIO;
			if (error)
				goto out_free_buffer;
			/* new_blk only moves if the record was incomplete */
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto out_free_buffer;
	}

	kmem_free(buffer);
	/* log_bbnum is the stand-in for block 0 used by the math above */
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number. Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1. In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

out_free_buffer:
	kmem_free(buffer);
	if (error)
		xfs_warn(log->l_mp, "failed to find log head");
	return error;
}
754 :
755 : /*
756 : * Seek backwards in the log for log record headers.
757 : *
758 : * Given a starting log block, walk backwards until we find the provided number
759 : * of records or hit the provided tail block. The return value is the number of
760 : * records encountered or a negative error code. The log block and buffer
761 : * pointer of the last record seen are returned in rblk and rhead respectively.
762 : */
763 : STATIC int
764 35397 : xlog_rseek_logrec_hdr(
765 : struct xlog *log,
766 : xfs_daddr_t head_blk,
767 : xfs_daddr_t tail_blk,
768 : int count,
769 : char *buffer,
770 : xfs_daddr_t *rblk,
771 : struct xlog_rec_header **rhead,
772 : bool *wrapped)
773 : {
774 35397 : int i;
775 35397 : int error;
776 35397 : int found = 0;
777 35397 : char *offset = NULL;
778 35397 : xfs_daddr_t end_blk;
779 :
780 35397 : *wrapped = false;
781 :
782 : /*
783 : * Walk backwards from the head block until we hit the tail or the first
784 : * block in the log.
785 : */
786 35397 : end_blk = head_blk > tail_blk ? tail_blk : 0;
787 4095717 : for (i = (int) head_blk - 1; i >= end_blk; i--) {
788 4092360 : error = xlog_bread(log, i, 1, buffer, &offset);
789 4092360 : if (error)
790 0 : goto out_error;
791 :
792 4092360 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
793 94978 : *rblk = i;
794 94978 : *rhead = (struct xlog_rec_header *) offset;
795 94978 : if (++found == count)
796 : break;
797 : }
798 : }
799 :
800 : /*
801 : * If we haven't hit the tail block or the log record header count,
802 : * start looking again from the end of the physical log. Note that
803 : * callers can pass head == tail if the tail is not yet known.
804 : */
805 35397 : if (tail_blk >= head_blk && found != count) {
806 24667 : for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
807 24664 : error = xlog_bread(log, i, 1, buffer, &offset);
808 24664 : if (error)
809 0 : goto out_error;
810 :
811 24664 : if (*(__be32 *)offset ==
812 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
813 194 : *wrapped = true;
814 194 : *rblk = i;
815 194 : *rhead = (struct xlog_rec_header *) offset;
816 194 : if (++found == count)
817 : break;
818 : }
819 : }
820 : }
821 :
822 : return found;
823 :
824 : out_error:
825 : return error;
826 : }
827 :
828 : /*
829 : * Seek forward in the log for log record headers.
830 : *
831 : * Given head and tail blocks, walk forward from the tail block until we find
832 : * the provided number of records or hit the head block. The return value is the
833 : * number of records encountered or a negative error code. The log block and
834 : * buffer pointer of the last record seen are returned in rblk and rhead
835 : * respectively.
836 : */
837 : STATIC int
838 11285 : xlog_seek_logrec_hdr(
839 : struct xlog *log,
840 : xfs_daddr_t head_blk,
841 : xfs_daddr_t tail_blk,
842 : int count,
843 : char *buffer,
844 : xfs_daddr_t *rblk,
845 : struct xlog_rec_header **rhead,
846 : bool *wrapped)
847 : {
848 11285 : int i;
849 11285 : int error;
850 11285 : int found = 0;
851 11285 : char *offset = NULL;
852 11285 : xfs_daddr_t end_blk;
853 :
854 11285 : *wrapped = false;
855 :
856 : /*
857 : * Walk forward from the tail block until we hit the head or the last
858 : * block in the log.
859 : */
860 11285 : end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
861 11285 : for (i = (int) tail_blk; i <= end_blk; i++) {
862 11285 : error = xlog_bread(log, i, 1, buffer, &offset);
863 11285 : if (error)
864 0 : goto out_error;
865 :
866 11285 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
867 11285 : *rblk = i;
868 11285 : *rhead = (struct xlog_rec_header *) offset;
869 11285 : if (++found == count)
870 : break;
871 : }
872 : }
873 :
874 : /*
875 : * If we haven't hit the head block or the log record header count,
876 : * start looking again from the start of the physical log.
877 : */
878 11285 : if (tail_blk > head_blk && found != count) {
879 0 : for (i = 0; i < (int) head_blk; i++) {
880 0 : error = xlog_bread(log, i, 1, buffer, &offset);
881 0 : if (error)
882 0 : goto out_error;
883 :
884 0 : if (*(__be32 *)offset ==
885 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
886 0 : *wrapped = true;
887 0 : *rblk = i;
888 0 : *rhead = (struct xlog_rec_header *) offset;
889 0 : if (++found == count)
890 : break;
891 : }
892 : }
893 : }
894 :
895 : return found;
896 :
897 : out_error:
898 : return error;
899 : }
900 :
901 : /*
902 : * Calculate distance from head to tail (i.e., unused space in the log).
903 : */
904 : static inline int
905 : xlog_tail_distance(
906 : struct xlog *log,
907 : xfs_daddr_t head_blk,
908 : xfs_daddr_t tail_blk)
909 : {
910 0 : if (head_blk < tail_blk)
911 0 : return tail_blk - head_blk;
912 :
913 0 : return tail_blk + (log->l_logBBsize - head_blk);
914 : }
915 :
916 : /*
917 : * Verify the log tail. This is particularly important when torn or incomplete
918 : * writes have been detected near the front of the log and the head has been
919 : * walked back accordingly.
920 : *
921 : * We also have to handle the case where the tail was pinned and the head
922 : * blocked behind the tail right before a crash. If the tail had been pushed
923 : * immediately prior to the crash and the subsequent checkpoint was only
924 : * partially written, it's possible it overwrote the last referenced tail in the
925 : * log with garbage. This is not a coherency problem because the tail must have
926 : * been pushed before it can be overwritten, but appears as log corruption to
927 : * recovery because we have no way to know the tail was updated if the
928 : * subsequent checkpoint didn't write successfully.
929 : *
930 : * Therefore, CRC check the log from tail to head. If a failure occurs and the
931 : * offending record is within max iclog bufs from the head, walk the tail
932 : * forward and retry until a valid tail is found or corruption is detected out
933 : * of the range of a possible overwrite.
934 : */
935 : STATIC int
936 11285 : xlog_verify_tail(
937 : struct xlog *log,
938 : xfs_daddr_t head_blk,
939 : xfs_daddr_t *tail_blk,
940 : int hsize)
941 : {
942 11285 : struct xlog_rec_header *thead;
943 11285 : char *buffer;
944 11285 : xfs_daddr_t first_bad;
945 11285 : int error = 0;
946 11285 : bool wrapped;
947 11285 : xfs_daddr_t tmp_tail;
948 11285 : xfs_daddr_t orig_tail = *tail_blk;
949 :
950 11285 : buffer = xlog_alloc_buffer(log, 1);
951 11285 : if (!buffer)
952 : return -ENOMEM;
953 :
954 : /*
955 : * Make sure the tail points to a record (returns positive count on
956 : * success).
957 : */
958 11285 : error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
959 : &tmp_tail, &thead, &wrapped);
960 11285 : if (error < 0)
961 0 : goto out;
962 11285 : if (*tail_blk != tmp_tail)
963 0 : *tail_blk = tmp_tail;
964 :
965 : /*
966 : * Run a CRC check from the tail to the head. We can't just check
967 : * MAX_ICLOGS records past the tail because the tail may point to stale
968 : * blocks cleared during the search for the head/tail. These blocks are
969 : * overwritten with zero-length records and thus record count is not a
970 : * reliable indicator of the iclog state before a crash.
971 : */
972 11285 : first_bad = 0;
973 11285 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
974 : XLOG_RECOVER_CRCPASS, &first_bad);
975 11285 : while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
976 0 : int tail_distance;
977 :
978 : /*
979 : * Is corruption within range of the head? If so, retry from
980 : * the next record. Otherwise return an error.
981 : */
982 0 : tail_distance = xlog_tail_distance(log, head_blk, first_bad);
983 0 : if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
984 : break;
985 :
986 : /* skip to the next record; returns positive count on success */
987 0 : error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
988 : buffer, &tmp_tail, &thead, &wrapped);
989 0 : if (error < 0)
990 0 : goto out;
991 :
992 0 : *tail_blk = tmp_tail;
993 0 : first_bad = 0;
994 0 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
995 : XLOG_RECOVER_CRCPASS, &first_bad);
996 : }
997 :
998 11285 : if (!error && *tail_blk != orig_tail)
999 0 : xfs_warn(log->l_mp,
1000 : "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1001 : orig_tail, *tail_blk);
1002 11285 : out:
1003 11285 : kmem_free(buffer);
1004 11285 : return error;
1005 : }
1006 :
1007 : /*
1008 : * Detect and trim torn writes from the head of the log.
1009 : *
1010 : * Storage without sector atomicity guarantees can result in torn writes in the
1011 : * log in the event of a crash. Our only means to detect this scenario is via
1012 : * CRC verification. While we can't always be certain that CRC verification
1013 : * failure is due to a torn write vs. an unrelated corruption, we do know that
1014 : * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1015 : * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1016 : * the log and treat failures in this range as torn writes as a matter of
1017 : * policy. In the event of CRC failure, the head is walked back to the last good
1018 : * record in the log and the tail is updated from that record and verified.
1019 : */
1020 : STATIC int
1021 11285 : xlog_verify_head(
1022 : struct xlog *log,
1023 : xfs_daddr_t *head_blk, /* in/out: unverified head */
1024 : xfs_daddr_t *tail_blk, /* out: tail block */
1025 : char *buffer,
1026 : xfs_daddr_t *rhead_blk, /* start blk of last record */
1027 : struct xlog_rec_header **rhead, /* ptr to last record */
1028 : bool *wrapped) /* last rec. wraps phys. log */
1029 : {
1030 11285 : struct xlog_rec_header *tmp_rhead;
1031 11285 : char *tmp_buffer;
1032 11285 : xfs_daddr_t first_bad;
1033 11285 : xfs_daddr_t tmp_rhead_blk;
1034 11285 : int found;
1035 11285 : int error;
1036 11285 : bool tmp_wrapped;
1037 :
1038 : /*
1039 : * Check the head of the log for torn writes. Search backwards from the
1040 : * head until we hit the tail or the maximum number of log record I/Os
1041 : * that could have been in flight at one time. Use a temporary buffer so
1042 : * we don't trash the rhead/buffer pointers from the caller.
1043 : */
1044 11285 : tmp_buffer = xlog_alloc_buffer(log, 1);
1045 11285 : if (!tmp_buffer)
1046 : return -ENOMEM;
1047 11285 : error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1048 : XLOG_MAX_ICLOGS, tmp_buffer,
1049 : &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1050 11285 : kmem_free(tmp_buffer);
1051 11285 : if (error < 0)
1052 : return error;
1053 :
1054 : /*
1055 : * Now run a CRC verification pass over the records starting at the
1056 : * block found above to the current head. If a CRC failure occurs, the
1057 : * log block of the first bad record is saved in first_bad.
1058 : */
1059 11285 : error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1060 : XLOG_RECOVER_CRCPASS, &first_bad);
1061 11285 : if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1062 : /*
1063 : * We've hit a potential torn write. Reset the error and warn
1064 : * about it.
1065 : */
1066 11 : error = 0;
1067 11 : xfs_warn(log->l_mp,
1068 : "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1069 : first_bad, *head_blk);
1070 :
1071 : /*
1072 : * Get the header block and buffer pointer for the last good
1073 : * record before the bad record.
1074 : *
1075 : * Note that xlog_find_tail() clears the blocks at the new head
1076 : * (i.e., the records with invalid CRC) if the cycle number
1077 : * matches the current cycle.
1078 : */
1079 11 : found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1080 : buffer, rhead_blk, rhead, wrapped);
1081 11 : if (found < 0)
1082 : return found;
1083 11 : if (found == 0) /* XXX: right thing to do here? */
1084 : return -EIO;
1085 :
1086 : /*
1087 : * Reset the head block to the starting block of the first bad
1088 : * log record and set the tail block based on the last good
1089 : * record.
1090 : *
1091 : * Bail out if the updated head/tail match as this indicates
1092 : * possible corruption outside of the acceptable
1093 : * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1094 : */
1095 11 : *head_blk = first_bad;
1096 11 : *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1097 11 : if (*head_blk == *tail_blk) {
1098 0 : ASSERT(0);
1099 0 : return 0;
1100 : }
1101 : }
1102 11274 : if (error)
1103 : return error;
1104 :
1105 11285 : return xlog_verify_tail(log, *head_blk, tail_blk,
1106 11285 : be32_to_cpu((*rhead)->h_size));
1107 : }
1108 :
1109 : /*
1110 : * We need to make sure we handle log wrapping properly, so we can't use the
1111 : * calculated logbno directly. Make sure it wraps to the correct bno inside the
1112 : * log.
1113 : *
1114 : * The log is limited to 32 bit sizes, so we use the appropriate modulus
1115 : * operation here and cast it back to a 64 bit daddr on return.
1116 : */
1117 : static inline xfs_daddr_t
1118 : xlog_wrap_logbno(
1119 : struct xlog *log,
1120 : xfs_daddr_t bno)
1121 : {
1122 288921 : int mod;
1123 :
1124 288921 : div_s64_rem(bno, log->l_logBBsize, &mod);
1125 288921 : return mod;
1126 : }
1127 :
1128 : /*
1129 : * Check whether the head of the log points to an unmount record. In other
1130 : * words, determine whether the log is clean. If so, update the in-core state
1131 : * appropriately.
1132 : */
1133 : static int
1134 24112 : xlog_check_unmount_rec(
1135 : struct xlog *log,
1136 : xfs_daddr_t *head_blk,
1137 : xfs_daddr_t *tail_blk,
1138 : struct xlog_rec_header *rhead,
1139 : xfs_daddr_t rhead_blk,
1140 : char *buffer,
1141 : bool *clean)
1142 : {
1143 24112 : struct xlog_op_header *op_head;
1144 24112 : xfs_daddr_t umount_data_blk;
1145 24112 : xfs_daddr_t after_umount_blk;
1146 24112 : int hblks;
1147 24112 : int error;
1148 24112 : char *offset;
1149 :
1150 24112 : *clean = false;
1151 :
1152 : /*
1153 : * Look for unmount record. If we find it, then we know there was a
1154 : * clean unmount. Since 'i' could be the last block in the physical
1155 : * log, we convert to a log block before comparing to the head_blk.
1156 : *
1157 : * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1158 : * below. We won't want to clear the unmount record if there is one, so
1159 : * we pass the lsn of the unmount record rather than the block after it.
1160 : */
1161 24112 : hblks = xlog_logrec_hblks(log, rhead);
1162 48224 : after_umount_blk = xlog_wrap_logbno(log,
1163 24112 : rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1164 :
1165 48224 : if (*head_blk == after_umount_blk &&
1166 24112 : be32_to_cpu(rhead->h_num_logops) == 1) {
1167 12820 : umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1168 12820 : error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1169 12820 : if (error)
1170 : return error;
1171 :
1172 12820 : op_head = (struct xlog_op_header *)offset;
1173 12820 : if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1174 : /*
1175 : * Set tail and last sync so that newly written log
1176 : * records will point recovery to after the current
1177 : * unmount record.
1178 : */
1179 12816 : xlog_assign_atomic_lsn(&log->l_tail_lsn,
1180 12816 : log->l_curr_cycle, after_umount_blk);
1181 12816 : xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1182 : log->l_curr_cycle, after_umount_blk);
1183 12816 : *tail_blk = after_umount_blk;
1184 :
1185 12816 : *clean = true;
1186 : }
1187 : }
1188 :
1189 : return 0;
1190 : }
1191 :
1192 : static void
1193 24112 : xlog_set_state(
1194 : struct xlog *log,
1195 : xfs_daddr_t head_blk,
1196 : struct xlog_rec_header *rhead,
1197 : xfs_daddr_t rhead_blk,
1198 : bool bump_cycle)
1199 : {
1200 : /*
1201 : * Reset log values according to the state of the log when we
1202 : * crashed. In the case where head_blk == 0, we bump curr_cycle
1203 : * one because the next write starts a new cycle rather than
1204 : * continuing the cycle of the last good log record. At this
1205 : * point we have guaranteed that all partial log records have been
1206 : * accounted for. Therefore, we know that the last good log record
1207 : * written was complete and ended exactly on the end boundary
1208 : * of the physical log.
1209 : */
1210 24112 : log->l_prev_block = rhead_blk;
1211 24112 : log->l_curr_block = (int)head_blk;
1212 24112 : log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1213 24112 : if (bump_cycle)
1214 41 : log->l_curr_cycle++;
1215 24112 : atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1216 24112 : atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1217 24112 : xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1218 : BBTOB(log->l_curr_block));
1219 24112 : xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1220 : BBTOB(log->l_curr_block));
1221 24112 : }
1222 :
1223 : /*
1224 : * Find the sync block number or the tail of the log.
1225 : *
1226 : * This will be the block number of the last record to have its
1227 : * associated buffers synced to disk. Every log record header has
1228 : * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1229 : * to get a sync block number. The only concern is to figure out which
1230 : * log record header to believe.
1231 : *
1232 : * The following algorithm uses the log record header with the largest
1233 : * lsn. The entire log record does not need to be valid. We only care
1234 : * that the header is valid.
1235 : *
1236 : * We could speed up search by using current head_blk buffer, but it is not
1237 : * available.
1238 : */
1239 : STATIC int
1240 24105 : xlog_find_tail(
1241 : struct xlog *log,
1242 : xfs_daddr_t *head_blk,
1243 : xfs_daddr_t *tail_blk)
1244 : {
1245 24105 : xlog_rec_header_t *rhead;
1246 24105 : char *offset = NULL;
1247 24105 : char *buffer;
1248 24105 : int error;
1249 24105 : xfs_daddr_t rhead_blk;
1250 24105 : xfs_lsn_t tail_lsn;
1251 24105 : bool wrapped = false;
1252 24105 : bool clean = false;
1253 :
1254 : /*
1255 : * Find previous log record
1256 : */
1257 24105 : if ((error = xlog_find_head(log, head_blk)))
1258 : return error;
1259 24101 : ASSERT(*head_blk < INT_MAX);
1260 :
1261 24101 : buffer = xlog_alloc_buffer(log, 1);
1262 24101 : if (!buffer)
1263 : return -ENOMEM;
1264 24101 : if (*head_blk == 0) { /* special case */
1265 36 : error = xlog_bread(log, 0, 1, buffer, &offset);
1266 36 : if (error)
1267 0 : goto done;
1268 :
1269 36 : if (xlog_get_cycle(offset) == 0) {
1270 0 : *tail_blk = 0;
1271 : /* leave all other log inited values alone */
1272 0 : goto done;
1273 : }
1274 : }
1275 :
1276 : /*
1277 : * Search backwards through the log looking for the log record header
1278 : * block. This wraps all the way back around to the head so something is
1279 : * seriously wrong if we can't find it.
1280 : */
1281 24101 : error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
1282 : &rhead_blk, &rhead, &wrapped);
1283 24101 : if (error < 0)
1284 0 : goto done;
1285 24101 : if (!error) {
1286 0 : xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1287 0 : error = -EFSCORRUPTED;
1288 0 : goto done;
1289 : }
1290 24101 : *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1291 :
1292 : /*
1293 : * Set the log state based on the current head record.
1294 : */
1295 24101 : xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1296 24101 : tail_lsn = atomic64_read(&log->l_tail_lsn);
1297 :
1298 : /*
1299 : * Look for an unmount record at the head of the log. This sets the log
1300 : * state to determine whether recovery is necessary.
1301 : */
1302 24101 : error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1303 : rhead_blk, buffer, &clean);
1304 24101 : if (error)
1305 0 : goto done;
1306 :
1307 : /*
1308 : * Verify the log head if the log is not clean (e.g., we have anything
1309 : * but an unmount record at the head). This uses CRC verification to
1310 : * detect and trim torn writes. If discovered, CRC failures are
1311 : * considered torn writes and the log head is trimmed accordingly.
1312 : *
1313 : * Note that we can only run CRC verification when the log is dirty
1314 : * because there's no guarantee that the log data behind an unmount
1315 : * record is compatible with the current architecture.
1316 : */
1317 24101 : if (!clean) {
1318 11285 : xfs_daddr_t orig_head = *head_blk;
1319 :
1320 11285 : error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1321 : &rhead_blk, &rhead, &wrapped);
1322 11285 : if (error)
1323 0 : goto done;
1324 :
1325 : /* update in-core state again if the head changed */
1326 11285 : if (*head_blk != orig_head) {
1327 11 : xlog_set_state(log, *head_blk, rhead, rhead_blk,
1328 : wrapped);
1329 11 : tail_lsn = atomic64_read(&log->l_tail_lsn);
1330 11 : error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1331 : rhead, rhead_blk, buffer,
1332 : &clean);
1333 11 : if (error)
1334 0 : goto done;
1335 : }
1336 : }
1337 :
1338 : /*
1339 : * Note that the unmount was clean. If the unmount was not clean, we
1340 : * need to know this to rebuild the superblock counters from the perag
1341 : * headers if we have a filesystem using non-persistent counters.
1342 : */
1343 24101 : if (clean)
1344 12816 : set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
1345 :
1346 : /*
1347 : * Make sure that there are no blocks in front of the head
1348 : * with the same cycle number as the head. This can happen
1349 : * because we allow multiple outstanding log writes concurrently,
1350 : * and the later writes might make it out before earlier ones.
1351 : *
1352 : * We use the lsn from before modifying it so that we'll never
1353 : * overwrite the unmount record after a clean unmount.
1354 : *
1355 : * Do this only if we are going to recover the filesystem
1356 : *
1357 : * NOTE: This used to say "if (!readonly)"
1358 : * However on Linux, we can & do recover a read-only filesystem.
1359 : * We only skip recovery if NORECOVERY is specified on mount,
1360 : * in which case we would not be here.
1361 : *
1362 : * But... if the -device- itself is readonly, just skip this.
1363 : * We can't recover this device anyway, so it won't matter.
1364 : */
1365 24101 : if (!xfs_readonly_buftarg(log->l_targ))
1366 24097 : error = xlog_clear_stale_blocks(log, tail_lsn);
1367 :
1368 4 : done:
1369 24101 : kmem_free(buffer);
1370 :
1371 24101 : if (error)
1372 0 : xfs_warn(log->l_mp, "failed to locate log tail");
1373 : return error;
1374 : }
1375 :
1376 : /*
1377 : * Is the log zeroed at all?
1378 : *
1379 : * The last binary search should be changed to perform an X block read
1380 : * once X becomes small enough. You can then search linearly through
1381 : * the X blocks. This will cut down on the number of reads we need to do.
1382 : *
1383 : * If the log is partially zeroed, this routine will pass back the blkno
1384 : * of the first block with cycle number 0. It won't have a complete LR
1385 : * preceding it.
1386 : *
1387 : * Return:
1388 : * 0 => the log is completely written to
1389 : * 1 => use *blk_no as the first block of the log
1390 : * <0 => error has occurred
1391 : */
1392 : STATIC int
1393 24105 : xlog_find_zeroed(
1394 : struct xlog *log,
1395 : xfs_daddr_t *blk_no)
1396 : {
1397 24105 : char *buffer;
1398 24105 : char *offset;
1399 24105 : uint first_cycle, last_cycle;
1400 24105 : xfs_daddr_t new_blk, last_blk, start_blk;
1401 24105 : xfs_daddr_t num_scan_bblks;
1402 24105 : int error, log_bbnum = log->l_logBBsize;
1403 :
1404 24105 : *blk_no = 0;
1405 :
1406 : /* check totally zeroed log */
1407 24105 : buffer = xlog_alloc_buffer(log, 1);
1408 24105 : if (!buffer)
1409 : return -ENOMEM;
1410 24105 : error = xlog_bread(log, 0, 1, buffer, &offset);
1411 24105 : if (error)
1412 0 : goto out_free_buffer;
1413 :
1414 24105 : first_cycle = xlog_get_cycle(offset);
1415 24105 : if (first_cycle == 0) { /* completely zeroed log */
1416 0 : *blk_no = 0;
1417 0 : kmem_free(buffer);
1418 0 : return 1;
1419 : }
1420 :
1421 : /* check partially zeroed log */
1422 24105 : error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1423 24105 : if (error)
1424 0 : goto out_free_buffer;
1425 :
1426 24105 : last_cycle = xlog_get_cycle(offset);
1427 24105 : if (last_cycle != 0) { /* log completely written to */
1428 15888 : kmem_free(buffer);
1429 15888 : return 0;
1430 : }
1431 :
1432 : /* we have a partially zeroed log */
1433 8217 : last_blk = log_bbnum-1;
1434 8217 : error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1435 8217 : if (error)
1436 0 : goto out_free_buffer;
1437 :
1438 : /*
1439 : * Validate the answer. Because there is no way to guarantee that
1440 : * the entire log is made up of log records which are the same size,
1441 : * we scan over the defined maximum blocks. At this point, the maximum
1442 : * is not chosen to mean anything special. XXXmiken
1443 : */
1444 8217 : num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1445 8217 : ASSERT(num_scan_bblks <= INT_MAX);
1446 :
1447 8217 : if (last_blk < num_scan_bblks)
1448 : num_scan_bblks = last_blk;
1449 8217 : start_blk = last_blk - num_scan_bblks;
1450 :
1451 : /*
1452 : * We search for any instances of cycle number 0 that occur before
1453 : * our current estimate of the head. What we're trying to detect is
1454 : * 1 ... | 0 | 1 | 0...
1455 : * ^ binary search ends here
1456 : */
1457 8217 : if ((error = xlog_find_verify_cycle(log, start_blk,
1458 : (int)num_scan_bblks, 0, &new_blk)))
1459 0 : goto out_free_buffer;
1460 8217 : if (new_blk != -1)
1461 3 : last_blk = new_blk;
1462 :
1463 : /*
1464 : * Potentially backup over partial log record write. We don't need
1465 : * to search the end of the log because we know it is zero.
1466 : */
1467 8217 : error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1468 8217 : if (error == 1)
1469 : error = -EIO;
1470 8217 : if (error)
1471 0 : goto out_free_buffer;
1472 :
1473 8217 : *blk_no = last_blk;
1474 8217 : out_free_buffer:
1475 8217 : kmem_free(buffer);
1476 8217 : if (error)
1477 0 : return error;
1478 : return 1;
1479 : }
1480 :
1481 : /*
1482 : * These are simple subroutines used by xlog_clear_stale_blocks() below
1483 : * to initialize a buffer full of empty log record headers and write
1484 : * them into the log.
1485 : */
1486 : STATIC void
1487 98678112 : xlog_add_record(
1488 : struct xlog *log,
1489 : char *buf,
1490 : int cycle,
1491 : int block,
1492 : int tail_cycle,
1493 : int tail_block)
1494 : {
1495 98678112 : xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1496 :
1497 98678112 : memset(buf, 0, BBSIZE);
1498 98678112 : recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1499 98678112 : recp->h_cycle = cpu_to_be32(cycle);
1500 98678112 : recp->h_version = cpu_to_be32(
1501 : xfs_has_logv2(log->l_mp) ? 2 : 1);
1502 98678112 : recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1503 98678112 : recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1504 98678112 : recp->h_fmt = cpu_to_be32(XLOG_FMT);
1505 197356224 : memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1506 98678112 : }
1507 :
1508 : STATIC int
1509 24584 : xlog_write_log_records(
1510 : struct xlog *log,
1511 : int cycle,
1512 : int start_block,
1513 : int blocks,
1514 : int tail_cycle,
1515 : int tail_block)
1516 : {
1517 24584 : char *offset;
1518 24584 : char *buffer;
1519 24584 : int balign, ealign;
1520 24584 : int sectbb = log->l_sectBBsize;
1521 24584 : int end_block = start_block + blocks;
1522 24584 : int bufblks;
1523 24584 : int error = 0;
1524 24584 : int i, j = 0;
1525 :
1526 : /*
1527 : * Greedily allocate a buffer big enough to handle the full
1528 : * range of basic blocks to be written. If that fails, try
1529 : * a smaller size. We need to be able to write at least a
1530 : * log sector, or we're out of luck.
1531 : */
1532 24584 : bufblks = 1 << ffs(blocks);
1533 24590 : while (bufblks > log->l_logBBsize)
1534 6 : bufblks >>= 1;
1535 24584 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1536 0 : bufblks >>= 1;
1537 0 : if (bufblks < sectbb)
1538 : return -ENOMEM;
1539 : }
1540 :
1541 : /* We may need to do a read at the start to fill in part of
1542 : * the buffer in the starting sector not covered by the first
1543 : * write below.
1544 : */
1545 24584 : balign = round_down(start_block, sectbb);
1546 24584 : if (balign != start_block) {
1547 0 : error = xlog_bread_noalign(log, start_block, 1, buffer);
1548 0 : if (error)
1549 0 : goto out_free_buffer;
1550 :
1551 0 : j = start_block - balign;
1552 : }
1553 :
1554 127888 : for (i = start_block; i < end_block; i += bufblks) {
1555 103304 : int bcount, endcount;
1556 :
1557 103304 : bcount = min(bufblks, end_block - start_block);
1558 103304 : endcount = bcount - j;
1559 :
1560 : /* We may need to do a read at the end to fill in part of
1561 : * the buffer in the final sector not covered by the write.
1562 : * If this is the same sector as the above read, skip it.
1563 : */
1564 103304 : ealign = round_down(end_block, sectbb);
1565 103304 : if (j == 0 && (start_block + endcount > ealign)) {
1566 0 : error = xlog_bread_noalign(log, ealign, sectbb,
1567 0 : buffer + BBTOB(ealign - start_block));
1568 0 : if (error)
1569 : break;
1570 :
1571 : }
1572 :
1573 103304 : offset = buffer + xlog_align(log, start_block);
1574 98781416 : for (; j < endcount; j++) {
1575 98678112 : xlog_add_record(log, offset, cycle, i+j,
1576 : tail_cycle, tail_block);
1577 98678112 : offset += BBSIZE;
1578 : }
1579 103304 : error = xlog_bwrite(log, start_block, endcount, buffer);
1580 103304 : if (error)
1581 : break;
1582 103304 : start_block += endcount;
1583 103304 : j = 0;
1584 : }
1585 :
1586 24584 : out_free_buffer:
1587 24584 : kmem_free(buffer);
1588 24584 : return error;
1589 : }
1590 :
1591 : /*
1592 : * This routine is called to blow away any incomplete log writes out
1593 : * in front of the log head. We do this so that we won't become confused
1594 : * if we come up, write only a little bit more, and then crash again.
1595 : * If we leave the partial log records out there, this situation could
1596 : * cause us to think those partial writes are valid blocks since they
1597 : * have the current cycle number. We get rid of them by overwriting them
1598 : * with empty log records with the old cycle number rather than the
1599 : * current one.
1600 : *
1601 : * The tail lsn is passed in rather than taken from
1602 : * the log so that we will not write over the unmount record after a
1603 : * clean unmount in a 512 block log. Doing so would leave the log without
1604 : * any valid log records in it until a new one was written. If we crashed
1605 : * during that time we would not be able to recover.
1606 : */
1607 : STATIC int
1608 24097 : xlog_clear_stale_blocks(
1609 : struct xlog *log,
1610 : xfs_lsn_t tail_lsn)
1611 : {
1612 24097 : int tail_cycle, head_cycle;
1613 24097 : int tail_block, head_block;
1614 24097 : int tail_distance, max_distance;
1615 24097 : int distance;
1616 24097 : int error;
1617 :
1618 24097 : tail_cycle = CYCLE_LSN(tail_lsn);
1619 24097 : tail_block = BLOCK_LSN(tail_lsn);
1620 24097 : head_cycle = log->l_curr_cycle;
1621 24097 : head_block = log->l_curr_block;
1622 :
1623 : /*
1624 : * Figure out the distance between the new head of the log
1625 : * and the tail. We want to write over any blocks beyond the
1626 : * head that we may have written just before the crash, but
1627 : * we don't want to overwrite the tail of the log.
1628 : */
1629 24097 : if (head_cycle == tail_cycle) {
1630 : /*
1631 : * The tail is behind the head in the physical log,
1632 : * so the distance from the head to the tail is the
1633 : * distance from the head to the end of the log plus
1634 : * the distance from the beginning of the log to the
1635 : * tail.
1636 : */
1637 23522 : if (XFS_IS_CORRUPT(log->l_mp,
1638 : head_block < tail_block ||
1639 : head_block >= log->l_logBBsize))
1640 0 : return -EFSCORRUPTED;
1641 23522 : tail_distance = tail_block + (log->l_logBBsize - head_block);
1642 : } else {
1643 : /*
1644 : * The head is behind the tail in the physical log,
1645 : * so the distance from the head to the tail is just
1646 : * the tail block minus the head block.
1647 : */
1648 575 : if (XFS_IS_CORRUPT(log->l_mp,
1649 : head_block >= tail_block ||
1650 : head_cycle != tail_cycle + 1))
1651 0 : return -EFSCORRUPTED;
1652 575 : tail_distance = tail_block - head_block;
1653 : }
1654 :
1655 : /*
1656 : * If the head is right up against the tail, we can't clear
1657 : * anything.
1658 : */
1659 24097 : if (tail_distance <= 0) {
1660 0 : ASSERT(tail_distance == 0);
1661 0 : return 0;
1662 : }
1663 :
1664 24097 : max_distance = XLOG_TOTAL_REC_SHIFT(log);
1665 : /*
1666 : * Take the smaller of the maximum amount of outstanding I/O
1667 : * we could have and the distance to the tail to clear out.
1668 : * We take the smaller so that we don't overwrite the tail and
1669 : * we don't waste all day writing from the head to the tail
1670 : * for no reason.
1671 : */
1672 24097 : max_distance = min(max_distance, tail_distance);
1673 :
1674 24097 : if ((head_block + max_distance) <= log->l_logBBsize) {
1675 : /*
1676 : * We can stomp all the blocks we need to without
1677 : * wrapping around the end of the log. Just do it
1678 : * in a single write. Use the cycle number of the
1679 : * current cycle minus one so that the log will look like:
1680 : * n ... | n - 1 ...
1681 : */
1682 23610 : error = xlog_write_log_records(log, (head_cycle - 1),
1683 : head_block, max_distance, tail_cycle,
1684 : tail_block);
1685 23610 : if (error)
1686 0 : return error;
1687 : } else {
1688 : /*
1689 : * We need to wrap around the end of the physical log in
1690 : * order to clear all the blocks. Do it in two separate
1691 : * I/Os. The first write should be from the head to the
1692 : * end of the physical log, and it should use the current
1693 : * cycle number minus one just like above.
1694 : */
1695 487 : distance = log->l_logBBsize - head_block;
1696 487 : error = xlog_write_log_records(log, (head_cycle - 1),
1697 : head_block, distance, tail_cycle,
1698 : tail_block);
1699 :
1700 487 : if (error)
1701 : return error;
1702 :
1703 : /*
1704 : * Now write the blocks at the start of the physical log.
1705 : * This writes the remainder of the blocks we want to clear.
1706 : * It uses the current cycle number since we're now on the
1707 : * same cycle as the head so that we get:
1708 : * n ... n ... | n - 1 ...
1709 : * ^^^^^ blocks we're writing
1710 : */
1711 487 : distance = max_distance - (log->l_logBBsize - head_block);
1712 487 : error = xlog_write_log_records(log, head_cycle, 0, distance,
1713 : tail_cycle, tail_block);
1714 487 : if (error)
1715 0 : return error;
1716 : }
1717 :
1718 : return 0;
1719 : }
1720 :
1721 : /*
1722 : * Release the recovered intent item in the AIL that matches the given intent
1723 : * type and intent id.
1724 : */
1725 : void
1726 145214 : xlog_recover_release_intent(
1727 : struct xlog *log,
1728 : unsigned short intent_type,
1729 : uint64_t intent_id)
1730 : {
1731 145214 : struct xfs_ail_cursor cur;
1732 145214 : struct xfs_log_item *lip;
1733 145214 : struct xfs_ail *ailp = log->l_ailp;
1734 :
1735 145214 : spin_lock(&ailp->ail_lock);
1736 172206 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1737 26992 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1738 171787 : if (lip->li_type != intent_type)
1739 20771 : continue;
1740 151016 : if (!lip->li_ops->iop_match(lip, intent_id))
1741 6221 : continue;
1742 :
1743 144795 : spin_unlock(&ailp->ail_lock);
1744 144795 : lip->li_ops->iop_release(lip);
1745 144795 : spin_lock(&ailp->ail_lock);
1746 : break;
1747 : }
1748 :
1749 145214 : xfs_trans_ail_cursor_done(&cur);
1750 145214 : spin_unlock(&ailp->ail_lock);
1751 145214 : }
1752 :
1753 : int
1754 425 : xlog_recover_iget(
1755 : struct xfs_mount *mp,
1756 : xfs_ino_t ino,
1757 : struct xfs_inode **ipp)
1758 : {
1759 425 : int error;
1760 :
1761 425 : error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
1762 425 : if (error)
1763 : return error;
1764 :
1765 425 : error = xfs_qm_dqattach(*ipp);
1766 425 : if (error) {
1767 0 : xfs_irele(*ipp);
1768 0 : return error;
1769 : }
1770 :
1771 425 : if (VFS_I(*ipp)->i_nlink == 0)
1772 27 : xfs_iflags_set(*ipp, XFS_IRECOVERY);
1773 :
1774 : return 0;
1775 : }
1776 :
1777 : /******************************************************************************
1778 : *
1779 : * Log recover routines
1780 : *
1781 : ******************************************************************************
1782 : */
1783 : static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
1784 : &xlog_buf_item_ops,
1785 : &xlog_inode_item_ops,
1786 : &xlog_dquot_item_ops,
1787 : &xlog_quotaoff_item_ops,
1788 : &xlog_icreate_item_ops,
1789 : &xlog_efi_item_ops,
1790 : &xlog_efd_item_ops,
1791 : &xlog_rui_item_ops,
1792 : &xlog_rud_item_ops,
1793 : &xlog_cui_item_ops,
1794 : &xlog_cud_item_ops,
1795 : &xlog_bui_item_ops,
1796 : &xlog_bud_item_ops,
1797 : &xlog_attri_item_ops,
1798 : &xlog_attrd_item_ops,
1799 : &xlog_sxi_item_ops,
1800 : &xlog_sxd_item_ops,
1801 : };
1802 :
1803 : static const struct xlog_recover_item_ops *
1804 51446224 : xlog_find_item_ops(
1805 : struct xlog_recover_item *item)
1806 : {
1807 51446224 : unsigned int i;
1808 :
1809 90601788 : for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1810 90601788 : if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1811 51446224 : return xlog_recover_item_ops[i];
1812 :
1813 : return NULL;
1814 : }
1815 :
1816 : /*
1817 : * Sort the log items in the transaction.
1818 : *
1819 : * The ordering constraints are defined by the inode allocation and unlink
1820 : * behaviour. The rules are:
1821 : *
1822 : * 1. Every item is only logged once in a given transaction. Hence it
1823 : * represents the last logged state of the item. Hence ordering is
1824 : * dependent on the order in which operations need to be performed so
1825 : * required initial conditions are always met.
1826 : *
1827 : * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1828 : * there's nothing to replay from them so we can simply cull them
1829 : * from the transaction. However, we can't do that until after we've
1830 : * replayed all the other items because they may be dependent on the
1831 : * cancelled buffer and replaying the cancelled buffer can remove it
1832 : * from the cancelled buffer table. Hence they have to be done last.
1833 : *
1834 : * 3. Inode allocation buffers must be replayed before inode items that
1835 : * read the buffer and replay changes into it. For filesystems using the
1836 : * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1837 : * treated the same as inode allocation buffers as they create and
1838 : * initialise the buffers directly.
1839 : *
1840 : * 4. Inode unlink buffers must be replayed after inode items are replayed.
1841 : * This ensures that inodes are completely flushed to the inode buffer
1842 : * in a "free" state before we remove the unlinked inode list pointer.
1843 : *
1844 : * Hence the ordering needs to be inode allocation buffers first, inode items
1845 : * second, inode unlink buffers third and cancelled buffers last.
1846 : *
1847 : * But there's a problem with that - we can't tell an inode allocation buffer
1848 : * apart from a regular buffer, so we can't separate them. We can, however,
1849 : * tell an inode unlink buffer from the others, and so we can separate them out
1850 : * from all the other buffers and move them to last.
1851 : *
1852 : * Hence, 4 lists, in order from head to tail:
1853 : * - buffer_list for all buffers except cancelled/inode unlink buffers
1854 : * - item_list for all non-buffer items
1855 : * - inode_buffer_list for inode unlink buffers
1856 : * - cancel_list for the cancelled buffers
1857 : *
1858 : * Note that we add objects to the tail of the lists so that first-to-last
1859 : * ordering is preserved within the lists. Adding objects to the head of the
1860 : * list means when we traverse from the head we walk them in last-to-first
1861 : * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1862 : * but for all other items there may be specific ordering that we need to
1863 : * preserve.
1864 : */
1865 : STATIC int
1866 763682 : xlog_recover_reorder_trans(
1867 : struct xlog *log,
1868 : struct xlog_recover *trans,
1869 : int pass)
1870 : {
1871 763682 : struct xlog_recover_item *item, *n;
1872 763682 : int error = 0;
1873 763682 : LIST_HEAD(sort_list);
1874 763682 : LIST_HEAD(cancel_list);
1875 763682 : LIST_HEAD(buffer_list);
1876 763682 : LIST_HEAD(inode_buffer_list);
1877 763682 : LIST_HEAD(item_list);
1878 :
1879 763682 : list_splice_init(&trans->r_itemq, &sort_list);
1880 52209906 : list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1881 51446224 : enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
1882 :
1883 51446224 : item->ri_ops = xlog_find_item_ops(item);
1884 51446224 : if (!item->ri_ops) {
1885 0 : xfs_warn(log->l_mp,
1886 : "%s: unrecognized type of log operation (%d)",
1887 : __func__, ITEM_TYPE(item));
1888 0 : ASSERT(0);
1889 : /*
1890 : * return the remaining items back to the transaction
1891 : * item list so they can be freed in caller.
1892 : */
1893 0 : if (!list_empty(&sort_list))
1894 0 : list_splice_init(&sort_list, &trans->r_itemq);
1895 : error = -EFSCORRUPTED;
1896 : break;
1897 : }
1898 :
1899 51446224 : if (item->ri_ops->reorder)
1900 27484906 : fate = item->ri_ops->reorder(item);
1901 :
1902 27484906 : switch (fate) {
1903 26732836 : case XLOG_REORDER_BUFFER_LIST:
1904 26732836 : list_move_tail(&item->ri_list, &buffer_list);
1905 26732836 : break;
1906 735012 : case XLOG_REORDER_CANCEL_LIST:
1907 735012 : trace_xfs_log_recover_item_reorder_head(log,
1908 : trans, item, pass);
1909 735012 : list_move(&item->ri_list, &cancel_list);
1910 735012 : break;
1911 17058 : case XLOG_REORDER_INODE_BUFFER_LIST:
1912 17058 : list_move(&item->ri_list, &inode_buffer_list);
1913 17058 : break;
1914 23961318 : case XLOG_REORDER_ITEM_LIST:
1915 23961318 : trace_xfs_log_recover_item_reorder_tail(log,
1916 : trans, item, pass);
1917 23961318 : list_move_tail(&item->ri_list, &item_list);
1918 23961318 : break;
1919 : }
1920 : }
1921 :
1922 763682 : ASSERT(list_empty(&sort_list));
1923 763682 : if (!list_empty(&buffer_list))
1924 727072 : list_splice(&buffer_list, &trans->r_itemq);
1925 763682 : if (!list_empty(&item_list))
1926 759836 : list_splice_tail(&item_list, &trans->r_itemq);
1927 763682 : if (!list_empty(&inode_buffer_list))
1928 4862 : list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1929 763682 : if (!list_empty(&cancel_list))
1930 141062 : list_splice_tail(&cancel_list, &trans->r_itemq);
1931 763682 : return error;
1932 : }
1933 :
1934 : void
1935 25421891 : xlog_buf_readahead(
1936 : struct xlog *log,
1937 : xfs_daddr_t blkno,
1938 : uint len,
1939 : const struct xfs_buf_ops *ops)
1940 : {
1941 25421891 : if (!xlog_is_buffer_cancelled(log, blkno, len))
1942 24388300 : xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
1943 25421891 : }
1944 :
1945 : STATIC int
1946 489570 : xlog_recover_items_pass2(
1947 : struct xlog *log,
1948 : struct xlog_recover *trans,
1949 : struct list_head *buffer_list,
1950 : struct list_head *item_list)
1951 : {
1952 489570 : struct xlog_recover_item *item;
1953 489570 : int error = 0;
1954 :
1955 26212682 : list_for_each_entry(item, item_list, ri_list) {
1956 25723112 : trace_xfs_log_recover_item_recover(log, trans, item,
1957 : XLOG_RECOVER_PASS2);
1958 :
1959 25723112 : if (item->ri_ops->commit_pass2)
1960 25723112 : error = item->ri_ops->commit_pass2(log, buffer_list,
1961 : item, trans->r_lsn);
1962 25723112 : if (error)
1963 0 : return error;
1964 : }
1965 :
1966 : return error;
1967 : }
1968 :
1969 : /*
1970 : * Perform the transaction.
1971 : *
1972 : * If the transaction modifies a buffer or inode, do it now. Otherwise,
1973 : * EFIs and EFDs get queued up by adding entries into the AIL for them.
1974 : */
1975 : STATIC int
1976 763682 : xlog_recover_commit_trans(
1977 : struct xlog *log,
1978 : struct xlog_recover *trans,
1979 : int pass,
1980 : struct list_head *buffer_list)
1981 : {
1982 763682 : int error = 0;
1983 763682 : int items_queued = 0;
1984 763682 : struct xlog_recover_item *item;
1985 763682 : struct xlog_recover_item *next;
1986 763682 : LIST_HEAD (ra_list);
1987 763682 : LIST_HEAD (done_list);
1988 :
1989 : #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
1990 :
1991 763682 : hlist_del_init(&trans->r_list);
1992 :
1993 763682 : error = xlog_recover_reorder_trans(log, trans, pass);
1994 763682 : if (error)
1995 : return error;
1996 :
1997 52209906 : list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1998 51446224 : trace_xfs_log_recover_item_recover(log, trans, item, pass);
1999 :
2000 51446224 : switch (pass) {
2001 25723112 : case XLOG_RECOVER_PASS1:
2002 25723112 : if (item->ri_ops->commit_pass1)
2003 13735006 : error = item->ri_ops->commit_pass1(log, item);
2004 : break;
2005 25723112 : case XLOG_RECOVER_PASS2:
2006 25723112 : if (item->ri_ops->ra_pass2)
2007 25421891 : item->ri_ops->ra_pass2(log, item);
2008 25723112 : list_move_tail(&item->ri_list, &ra_list);
2009 25723112 : items_queued++;
2010 25723112 : if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
2011 109460 : error = xlog_recover_items_pass2(log, trans,
2012 : buffer_list, &ra_list);
2013 109460 : list_splice_tail_init(&ra_list, &done_list);
2014 : items_queued = 0;
2015 : }
2016 :
2017 : break;
2018 0 : default:
2019 0 : ASSERT(0);
2020 : }
2021 :
2022 51446224 : if (error)
2023 0 : goto out;
2024 : }
2025 :
2026 763682 : out:
2027 763682 : if (!list_empty(&ra_list)) {
2028 380110 : if (!error)
2029 380110 : error = xlog_recover_items_pass2(log, trans,
2030 : buffer_list, &ra_list);
2031 380110 : list_splice_tail_init(&ra_list, &done_list);
2032 : }
2033 :
2034 763682 : if (!list_empty(&done_list))
2035 381841 : list_splice_init(&done_list, &trans->r_itemq);
2036 :
2037 : return error;
2038 : }
2039 :
2040 : STATIC void
2041 51559424 : xlog_recover_add_item(
2042 : struct list_head *head)
2043 : {
2044 51559424 : struct xlog_recover_item *item;
2045 :
2046 51559424 : item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
2047 51559424 : INIT_LIST_HEAD(&item->ri_list);
2048 51559424 : list_add_tail(&item->ri_list, head);
2049 51559424 : }
2050 :
2051 : STATIC int
2052 1800744 : xlog_recover_add_to_cont_trans(
2053 : struct xlog *log,
2054 : struct xlog_recover *trans,
2055 : char *dp,
2056 : int len)
2057 : {
2058 1800744 : struct xlog_recover_item *item;
2059 1800744 : char *ptr, *old_ptr;
2060 1800744 : int old_len;
2061 :
2062 : /*
2063 : * If the transaction is empty, the header was split across this and the
2064 : * previous record. Copy the rest of the header.
2065 : */
2066 1800744 : if (list_empty(&trans->r_itemq)) {
2067 2 : ASSERT(len <= sizeof(struct xfs_trans_header));
2068 2 : if (len > sizeof(struct xfs_trans_header)) {
2069 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2070 0 : return -EFSCORRUPTED;
2071 : }
2072 :
2073 2 : xlog_recover_add_item(&trans->r_itemq);
2074 2 : ptr = (char *)&trans->r_theader +
2075 2 : sizeof(struct xfs_trans_header) - len;
2076 4 : memcpy(ptr, dp, len);
2077 2 : return 0;
2078 : }
2079 :
2080 : /* take the tail entry */
2081 1800742 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2082 : ri_list);
2083 :
2084 1800742 : old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
2085 1800742 : old_len = item->ri_buf[item->ri_cnt-1].i_len;
2086 :
2087 1800742 : ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2088 1800742 : if (!ptr)
2089 : return -ENOMEM;
2090 3601484 : memcpy(&ptr[old_len], dp, len);
2091 1800742 : item->ri_buf[item->ri_cnt-1].i_len += len;
2092 1800742 : item->ri_buf[item->ri_cnt-1].i_addr = ptr;
2093 1800742 : trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
2094 1800742 : return 0;
2095 : }
2096 :
2097 : /*
2098 : * The next region to add is the start of a new region. It could be
2099 : * a whole region or it could be the first part of a new region. Because
2100 : * of this, the assumption here is that the type and size fields of all
2101 : * format structures fit into the first 32 bits of the structure.
2102 : *
2103 : * This works because all regions must be 32 bit aligned. Therefore, we
2104 : * either have both fields or we have neither field. In the case we have
2105 : * neither field, the data part of the region is zero length. We only have
2106 : * a log_op_header and can throw away the header since a new one will appear
2107 : * later. If we have at least 4 bytes, then we can determine how many regions
2108 : * will appear in the current log item.
2109 : */
2110 : STATIC int
2111 130940826 : xlog_recover_add_to_trans(
2112 : struct xlog *log,
2113 : struct xlog_recover *trans,
2114 : char *dp,
2115 : int len)
2116 : {
2117 130940826 : struct xfs_inode_log_format *in_f; /* any will do */
2118 130940826 : struct xlog_recover_item *item;
2119 130940826 : char *ptr;
2120 :
2121 130940826 : if (!len)
2122 : return 0;
2123 130940826 : if (list_empty(&trans->r_itemq)) {
2124 : /* we need to catch log corruptions here */
2125 764912 : if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
2126 0 : xfs_warn(log->l_mp, "%s: bad header magic number",
2127 : __func__);
2128 0 : ASSERT(0);
2129 0 : return -EFSCORRUPTED;
2130 : }
2131 :
2132 764912 : if (len > sizeof(struct xfs_trans_header)) {
2133 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2134 0 : ASSERT(0);
2135 0 : return -EFSCORRUPTED;
2136 : }
2137 :
2138 : /*
2139 : * The transaction header can be arbitrarily split across op
2140 : * records. If we don't have the whole thing here, copy what we
2141 : * do have and handle the rest in the next record.
2142 : */
2143 764912 : if (len == sizeof(struct xfs_trans_header))
2144 764910 : xlog_recover_add_item(&trans->r_itemq);
2145 1529824 : memcpy(&trans->r_theader, dp, len);
2146 764912 : return 0;
2147 : }
2148 :
2149 130175914 : ptr = kmem_alloc(len, 0);
2150 260351828 : memcpy(ptr, dp, len);
2151 130175914 : in_f = (struct xfs_inode_log_format *)ptr;
2152 :
2153 : /* take the tail entry */
2154 130175914 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2155 : ri_list);
2156 130175914 : if (item->ri_total != 0 &&
2157 129411002 : item->ri_total == item->ri_cnt) {
2158 : /* tail item is in use, get a new one */
2159 50794512 : xlog_recover_add_item(&trans->r_itemq);
2160 50794512 : item = list_entry(trans->r_itemq.prev,
2161 : struct xlog_recover_item, ri_list);
2162 : }
2163 :
2164 130175914 : if (item->ri_total == 0) { /* first region to be added */
2165 51559424 : if (in_f->ilf_size == 0 ||
2166 : in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2167 0 : xfs_warn(log->l_mp,
2168 : "bad number of regions (%d) in inode log format",
2169 : in_f->ilf_size);
2170 0 : ASSERT(0);
2171 0 : kmem_free(ptr);
2172 0 : return -EFSCORRUPTED;
2173 : }
2174 :
2175 51559424 : item->ri_total = in_f->ilf_size;
2176 51559424 : item->ri_buf =
2177 51559424 : kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2178 : 0);
2179 : }
2180 :
2181 130175914 : if (item->ri_total <= item->ri_cnt) {
2182 0 : xfs_warn(log->l_mp,
2183 : "log item region count (%d) overflowed size (%d)",
2184 : item->ri_cnt, item->ri_total);
2185 0 : ASSERT(0);
2186 0 : kmem_free(ptr);
2187 0 : return -EFSCORRUPTED;
2188 : }
2189 :
2190 : /* Description region is ri_buf[0] */
2191 130175914 : item->ri_buf[item->ri_cnt].i_addr = ptr;
2192 130175914 : item->ri_buf[item->ri_cnt].i_len = len;
2193 130175914 : item->ri_cnt++;
2194 130175914 : trace_xfs_log_recover_item_add(log, trans, item, 0);
2195 130175914 : return 0;
2196 : }
2197 :
2198 : /*
2199 : * Free up any resources allocated by the transaction
2200 : *
2201 : * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2202 : */
2203 : STATIC void
2204 764912 : xlog_recover_free_trans(
2205 : struct xlog_recover *trans)
2206 : {
2207 764912 : struct xlog_recover_item *item, *n;
2208 764912 : int i;
2209 :
2210 764912 : hlist_del_init(&trans->r_list);
2211 :
2212 52324336 : list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2213 : /* Free the regions in the item. */
2214 51559424 : list_del(&item->ri_list);
2215 181735338 : for (i = 0; i < item->ri_cnt; i++)
2216 130175914 : kmem_free(item->ri_buf[i].i_addr);
2217 : /* Free the item itself */
2218 51559424 : kmem_free(item->ri_buf);
2219 51559424 : kmem_free(item);
2220 : }
2221 : /* Free the transaction recover structure */
2222 764912 : kmem_free(trans);
2223 764912 : }
2224 :
2225 : /*
2226 : * On error or completion, trans is freed.
2227 : */
2228 : STATIC int
2229 133505252 : xlog_recovery_process_trans(
2230 : struct xlog *log,
2231 : struct xlog_recover *trans,
2232 : char *dp,
2233 : unsigned int len,
2234 : unsigned int flags,
2235 : int pass,
2236 : struct list_head *buffer_list)
2237 : {
2238 133505252 : int error = 0;
2239 133505252 : bool freeit = false;
2240 :
2241 : /* mask off ophdr transaction container flags */
2242 133505252 : flags &= ~XLOG_END_TRANS;
2243 133505252 : if (flags & XLOG_WAS_CONT_TRANS)
2244 1800744 : flags &= ~XLOG_CONTINUE_TRANS;
2245 :
2246 : /*
2247 : * Callees must not free the trans structure. We'll decide if we need to
2248 : * free it or not based on the operation being done and it's result.
2249 : */
2250 133505252 : switch (flags) {
2251 : /* expected flag values */
2252 130940826 : case 0:
2253 : case XLOG_CONTINUE_TRANS:
2254 130940826 : error = xlog_recover_add_to_trans(log, trans, dp, len);
2255 130940826 : break;
2256 1800744 : case XLOG_WAS_CONT_TRANS:
2257 1800744 : error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2258 1800744 : break;
2259 763682 : case XLOG_COMMIT_TRANS:
2260 763682 : error = xlog_recover_commit_trans(log, trans, pass,
2261 : buffer_list);
2262 : /* success or fail, we are now done with this transaction. */
2263 763682 : freeit = true;
2264 763682 : break;
2265 :
2266 : /* unexpected flag values */
2267 0 : case XLOG_UNMOUNT_TRANS:
2268 : /* just skip trans */
2269 0 : xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2270 0 : freeit = true;
2271 0 : break;
2272 0 : case XLOG_START_TRANS:
2273 : default:
2274 0 : xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2275 0 : ASSERT(0);
2276 0 : error = -EFSCORRUPTED;
2277 0 : break;
2278 : }
2279 133505252 : if (error || freeit)
2280 763682 : xlog_recover_free_trans(trans);
2281 133505252 : return error;
2282 : }
2283 :
2284 : /*
2285 : * Lookup the transaction recovery structure associated with the ID in the
2286 : * current ophdr. If the transaction doesn't exist and the start flag is set in
2287 : * the ophdr, then allocate a new transaction for future ID matches to find.
2288 : * Either way, return what we found during the lookup - an existing transaction
2289 : * or nothing.
2290 : */
2291 : STATIC struct xlog_recover *
2292 134389642 : xlog_recover_ophdr_to_trans(
2293 : struct hlist_head rhash[],
2294 : struct xlog_rec_header *rhead,
2295 : struct xlog_op_header *ohead)
2296 : {
2297 134389642 : struct xlog_recover *trans;
2298 134389642 : xlog_tid_t tid;
2299 134389642 : struct hlist_head *rhp;
2300 :
2301 134389642 : tid = be32_to_cpu(ohead->oh_tid);
2302 134389642 : rhp = &rhash[XLOG_RHASH(tid)];
2303 268781064 : hlist_for_each_entry(trans, rhp, r_list) {
2304 133507032 : if (trans->r_log_tid == tid)
2305 133505252 : return trans;
2306 : }
2307 :
2308 : /*
2309 : * skip over non-start transaction headers - we could be
2310 : * processing slack space before the next transaction starts
2311 : */
2312 884390 : if (!(ohead->oh_flags & XLOG_START_TRANS))
2313 : return NULL;
2314 :
2315 764912 : ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2316 :
2317 : /*
2318 : * This is a new transaction so allocate a new recovery container to
2319 : * hold the recovery ops that will follow.
2320 : */
2321 764912 : trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2322 764912 : trans->r_log_tid = tid;
2323 764912 : trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2324 764912 : INIT_LIST_HEAD(&trans->r_itemq);
2325 764912 : INIT_HLIST_NODE(&trans->r_list);
2326 764912 : hlist_add_head(&trans->r_list, rhp);
2327 :
2328 : /*
2329 : * Nothing more to do for this ophdr. Items to be added to this new
2330 : * transaction will be in subsequent ophdr containers.
2331 : */
2332 764912 : return NULL;
2333 : }
2334 :
2335 : STATIC int
2336 134389642 : xlog_recover_process_ophdr(
2337 : struct xlog *log,
2338 : struct hlist_head rhash[],
2339 : struct xlog_rec_header *rhead,
2340 : struct xlog_op_header *ohead,
2341 : char *dp,
2342 : char *end,
2343 : int pass,
2344 : struct list_head *buffer_list)
2345 : {
2346 134389642 : struct xlog_recover *trans;
2347 134389642 : unsigned int len;
2348 134389642 : int error;
2349 :
2350 : /* Do we understand who wrote this op? */
2351 134389642 : if (ohead->oh_clientid != XFS_TRANSACTION &&
2352 : ohead->oh_clientid != XFS_LOG) {
2353 0 : xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2354 : __func__, ohead->oh_clientid);
2355 0 : ASSERT(0);
2356 0 : return -EFSCORRUPTED;
2357 : }
2358 :
2359 : /*
2360 : * Check the ophdr contains all the data it is supposed to contain.
2361 : */
2362 134389642 : len = be32_to_cpu(ohead->oh_len);
2363 134389642 : if (dp + len > end) {
2364 0 : xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2365 0 : WARN_ON(1);
2366 0 : return -EFSCORRUPTED;
2367 : }
2368 :
2369 134389642 : trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2370 134389642 : if (!trans) {
2371 : /* nothing to do, so skip over this ophdr */
2372 : return 0;
2373 : }
2374 :
2375 : /*
2376 : * The recovered buffer queue is drained only once we know that all
2377 : * recovery items for the current LSN have been processed. This is
2378 : * required because:
2379 : *
2380 : * - Buffer write submission updates the metadata LSN of the buffer.
2381 : * - Log recovery skips items with a metadata LSN >= the current LSN of
2382 : * the recovery item.
2383 : * - Separate recovery items against the same metadata buffer can share
2384 : * a current LSN. I.e., consider that the LSN of a recovery item is
2385 : * defined as the starting LSN of the first record in which its
2386 : * transaction appears, that a record can hold multiple transactions,
2387 : * and/or that a transaction can span multiple records.
2388 : *
2389 : * In other words, we are allowed to submit a buffer from log recovery
2390 : * once per current LSN. Otherwise, we may incorrectly skip recovery
2391 : * items and cause corruption.
2392 : *
2393 : * We don't know up front whether buffers are updated multiple times per
2394 : * LSN. Therefore, track the current LSN of each commit log record as it
2395 : * is processed and drain the queue when it changes. Use commit records
2396 : * because they are ordered correctly by the logging code.
2397 : */
2398 133505252 : if (log->l_recovery_lsn != trans->r_lsn &&
2399 133397250 : ohead->oh_flags & XLOG_COMMIT_TRANS) {
2400 760390 : error = xfs_buf_delwri_submit(buffer_list);
2401 760390 : if (error)
2402 : return error;
2403 760390 : log->l_recovery_lsn = trans->r_lsn;
2404 : }
2405 :
2406 133505252 : return xlog_recovery_process_trans(log, trans, dp, len,
2407 133505252 : ohead->oh_flags, pass, buffer_list);
2408 : }
2409 :
2410 : /*
2411 : * There are two valid states of the r_state field. 0 indicates that the
2412 : * transaction structure is in a normal state. We have either seen the
2413 : * start of the transaction or the last operation we added was not a partial
2414 : * operation. If the last operation we added to the transaction was a
2415 : * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2416 : *
2417 : * NOTE: skip LRs with 0 data length.
2418 : */
2419 : STATIC int
2420 2609736 : xlog_recover_process_data(
2421 : struct xlog *log,
2422 : struct hlist_head rhash[],
2423 : struct xlog_rec_header *rhead,
2424 : char *dp,
2425 : int pass,
2426 : struct list_head *buffer_list)
2427 : {
2428 2609736 : struct xlog_op_header *ohead;
2429 2609736 : char *end;
2430 2609736 : int num_logops;
2431 2609736 : int error;
2432 :
2433 2609736 : end = dp + be32_to_cpu(rhead->h_len);
2434 2609736 : num_logops = be32_to_cpu(rhead->h_num_logops);
2435 :
2436 : /* check the log format matches our own - else we can't recover */
2437 2609736 : if (xlog_header_check_recover(log->l_mp, rhead))
2438 : return -EIO;
2439 :
2440 2609736 : trace_xfs_log_recover_record(log, rhead, pass);
2441 136999378 : while ((dp < end) && num_logops) {
2442 :
2443 134389642 : ohead = (struct xlog_op_header *)dp;
2444 134389642 : dp += sizeof(*ohead);
2445 134389642 : ASSERT(dp <= end);
2446 :
2447 : /* errors will abort recovery */
2448 134389642 : error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
2449 : dp, end, pass, buffer_list);
2450 134389642 : if (error)
2451 0 : return error;
2452 :
2453 134389642 : dp += be32_to_cpu(ohead->oh_len);
2454 134389642 : num_logops--;
2455 : }
2456 : return 0;
2457 : }
2458 :
2459 : /* Take all the collected deferred ops and finish them in order. */
2460 : static int
2461 11281 : xlog_finish_defer_ops(
2462 : struct xfs_mount *mp,
2463 : struct list_head *capture_list)
2464 : {
2465 11281 : struct xfs_defer_capture *dfc, *next;
2466 11281 : struct xfs_trans *tp;
2467 11281 : int error = 0;
2468 :
2469 11967 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2470 686 : struct xfs_trans_res resv;
2471 686 : struct xfs_defer_resources dres;
2472 :
2473 : /*
2474 : * Create a new transaction reservation from the captured
2475 : * information. Set logcount to 1 to force the new transaction
2476 : * to regrant every roll so that we can make forward progress
2477 : * in recovery no matter how full the log might be.
2478 : */
2479 686 : resv.tr_logres = dfc->dfc_logres;
2480 686 : resv.tr_logcount = 1;
2481 686 : resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2482 :
2483 686 : error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2484 : dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2485 686 : if (error) {
2486 0 : xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
2487 0 : return error;
2488 : }
2489 :
2490 : /*
2491 : * Transfer to this new transaction all the dfops we captured
2492 : * from recovering a single intent item.
2493 : */
2494 686 : list_del_init(&dfc->dfc_list);
2495 686 : xfs_defer_ops_continue(dfc, tp, &dres);
2496 686 : error = xfs_trans_commit(tp);
2497 686 : xfs_defer_resources_rele(&dres);
2498 686 : if (error)
2499 0 : return error;
2500 : }
2501 :
2502 11281 : ASSERT(list_empty(capture_list));
2503 : return 0;
2504 : }
2505 :
2506 : /* Release all the captured defer ops and capture structures in this list. */
2507 : static void
2508 2 : xlog_abort_defer_ops(
2509 : struct xfs_mount *mp,
2510 : struct list_head *capture_list)
2511 : {
2512 2 : struct xfs_defer_capture *dfc;
2513 2 : struct xfs_defer_capture *next;
2514 :
2515 2 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2516 0 : list_del_init(&dfc->dfc_list);
2517 0 : xfs_defer_ops_capture_free(mp, dfc);
2518 : }
2519 2 : }
2520 :
2521 : /*
2522 : * When this is called, all of the log intent items which did not have
2523 : * corresponding log done items should be in the AIL. What we do now is update
2524 : * the data structures associated with each one.
2525 : *
2526 : * Since we process the log intent items in normal transactions, they will be
2527 : * removed at some point after the commit. This prevents us from just walking
2528 : * down the list processing each one. We'll use a flag in the intent item to
2529 : * skip those that we've already processed and use the AIL iteration mechanism's
2530 : * generation count to try to speed this up at least a bit.
2531 : *
2532 : * When we start, we know that the intents are the only things in the AIL. As we
2533 : * process them, however, other items are added to the AIL. Hence we know we
2534 : * have started recovery on all the pending intents when we find an non-intent
2535 : * item in the AIL.
2536 : */
2537 : STATIC int
2538 11283 : xlog_recover_process_intents(
2539 : struct xlog *log)
2540 : {
2541 11283 : LIST_HEAD(capture_list);
2542 11283 : struct xfs_ail_cursor cur;
2543 11283 : struct xfs_log_item *lip;
2544 11283 : struct xfs_ail *ailp;
2545 11283 : int error = 0;
2546 : #if defined(DEBUG) || defined(XFS_WARN)
2547 11283 : xfs_lsn_t last_lsn;
2548 : #endif
2549 :
2550 11283 : ailp = log->l_ailp;
2551 11283 : spin_lock(&ailp->ail_lock);
2552 : #if defined(DEBUG) || defined(XFS_WARN)
2553 11283 : last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2554 : #endif
2555 11283 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2556 15046 : lip != NULL;
2557 3763 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
2558 3765 : const struct xfs_item_ops *ops;
2559 :
2560 3765 : if (!xlog_item_is_intent(lip))
2561 : break;
2562 :
2563 : /*
2564 : * We should never see a redo item with a LSN higher than
2565 : * the last transaction we found in the log at the start
2566 : * of recovery.
2567 : */
2568 7530 : ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2569 :
2570 : /*
2571 : * NOTE: If your intent processing routine can create more
2572 : * deferred ops, you /must/ attach them to the capture list in
2573 : * the recover routine or else those subsequent intents will be
2574 : * replayed in the wrong order!
2575 : *
2576 : * The recovery function can free the log item, so we must not
2577 : * access lip after it returns.
2578 : */
2579 3765 : spin_unlock(&ailp->ail_lock);
2580 3765 : ops = lip->li_ops;
2581 3765 : error = ops->iop_recover(lip, &capture_list);
2582 3765 : spin_lock(&ailp->ail_lock);
2583 3765 : if (error) {
2584 2 : trace_xlog_intent_recovery_failed(log->l_mp, error,
2585 2 : ops->iop_recover);
2586 2 : break;
2587 : }
2588 : }
2589 :
2590 11283 : xfs_trans_ail_cursor_done(&cur);
2591 11283 : spin_unlock(&ailp->ail_lock);
2592 11283 : if (error)
2593 2 : goto err;
2594 :
2595 11281 : error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2596 11281 : if (error)
2597 0 : goto err;
2598 :
2599 : return 0;
2600 2 : err:
2601 2 : xlog_abort_defer_ops(log->l_mp, &capture_list);
2602 2 : return error;
2603 : }
2604 :
2605 : /*
2606 : * A cancel occurs when the mount has failed and we're bailing out. Release all
2607 : * pending log intent items that we haven't started recovery on so they don't
2608 : * pin the AIL.
2609 : */
2610 : STATIC void
2611 2 : xlog_recover_cancel_intents(
2612 : struct xlog *log)
2613 : {
2614 2 : struct xfs_log_item *lip;
2615 2 : struct xfs_ail_cursor cur;
2616 2 : struct xfs_ail *ailp;
2617 :
2618 2 : ailp = log->l_ailp;
2619 2 : spin_lock(&ailp->ail_lock);
2620 2 : lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2621 2 : while (lip != NULL) {
2622 0 : if (!xlog_item_is_intent(lip))
2623 : break;
2624 :
2625 0 : spin_unlock(&ailp->ail_lock);
2626 0 : lip->li_ops->iop_release(lip);
2627 0 : spin_lock(&ailp->ail_lock);
2628 0 : lip = xfs_trans_ail_cursor_next(ailp, &cur);
2629 : }
2630 :
2631 2 : xfs_trans_ail_cursor_done(&cur);
2632 2 : spin_unlock(&ailp->ail_lock);
2633 2 : }
2634 :
2635 : /*
2636 : * This routine performs a transaction to null out a bad inode pointer
2637 : * in an agi unlinked inode hash bucket.
2638 : */
2639 : STATIC void
2640 4 : xlog_recover_clear_agi_bucket(
2641 : struct xfs_perag *pag,
2642 : int bucket)
2643 : {
2644 4 : struct xfs_mount *mp = pag->pag_mount;
2645 4 : struct xfs_trans *tp;
2646 4 : struct xfs_agi *agi;
2647 4 : struct xfs_buf *agibp;
2648 4 : int offset;
2649 4 : int error;
2650 :
2651 4 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2652 4 : if (error)
2653 4 : goto out_error;
2654 :
2655 0 : error = xfs_read_agi(pag, tp, &agibp);
2656 0 : if (error)
2657 0 : goto out_abort;
2658 :
2659 0 : agi = agibp->b_addr;
2660 0 : agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
2661 0 : offset = offsetof(xfs_agi_t, agi_unlinked) +
2662 : (sizeof(xfs_agino_t) * bucket);
2663 0 : xfs_trans_log_buf(tp, agibp, offset,
2664 : (offset + sizeof(xfs_agino_t) - 1));
2665 :
2666 0 : error = xfs_trans_commit(tp);
2667 0 : if (error)
2668 0 : goto out_error;
2669 : return;
2670 :
2671 : out_abort:
2672 0 : xfs_trans_cancel(tp);
2673 4 : out_error:
2674 4 : xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
2675 : pag->pag_agno);
2676 4 : return;
2677 : }
2678 :
2679 : static int
2680 2886016 : xlog_recover_iunlink_bucket(
2681 : struct xfs_perag *pag,
2682 : struct xfs_agi *agi,
2683 : int bucket)
2684 : {
2685 2886016 : struct xfs_mount *mp = pag->pag_mount;
2686 2886016 : struct xfs_inode *prev_ip = NULL;
2687 2886016 : struct xfs_inode *ip;
2688 2886016 : xfs_agino_t prev_agino, agino;
2689 2886016 : int error = 0;
2690 :
2691 2886016 : agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2692 3052453 : while (agino != NULLAGINO) {
2693 166437 : error = xfs_iget(mp, NULL,
2694 166437 : XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
2695 : 0, 0, &ip);
2696 166437 : if (error)
2697 : break;
2698 :
2699 166437 : ASSERT(VFS_I(ip)->i_nlink == 0);
2700 166437 : ASSERT(VFS_I(ip)->i_mode != 0);
2701 166437 : xfs_iflags_clear(ip, XFS_IRECOVERY);
2702 166437 : agino = ip->i_next_unlinked;
2703 :
2704 166437 : if (prev_ip) {
2705 161106 : ip->i_prev_unlinked = prev_agino;
2706 161106 : xfs_irele(prev_ip);
2707 :
2708 : /*
2709 : * Ensure the inode is removed from the unlinked list
2710 : * before we continue so that it won't race with
2711 : * building the in-memory list here. This could be
2712 : * serialised with the agibp lock, but that just
2713 : * serialises via lockstepping and it's much simpler
2714 : * just to flush the inodegc queue and wait for it to
2715 : * complete.
2716 : */
2717 161106 : error = xfs_inodegc_flush(mp);
2718 161106 : if (error)
2719 : break;
2720 : }
2721 :
2722 166437 : prev_agino = agino;
2723 166437 : prev_ip = ip;
2724 : }
2725 :
2726 2886016 : if (prev_ip) {
2727 5331 : int error2;
2728 :
2729 5331 : ip->i_prev_unlinked = prev_agino;
2730 5331 : xfs_irele(prev_ip);
2731 :
2732 5331 : error2 = xfs_inodegc_flush(mp);
2733 5331 : if (error2 && !error)
2734 4 : return error2;
2735 : }
2736 : return error;
2737 : }
2738 :
2739 : /*
2740 : * Recover AGI unlinked lists
2741 : *
2742 : * This is called during recovery to process any inodes which we unlinked but
2743 : * not freed when the system crashed. These inodes will be on the lists in the
2744 : * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2745 : * any inodes found on the lists. Each inode is removed from the lists when it
2746 : * has been fully truncated and is freed. The freeing of the inode and its
2747 : * removal from the list must be atomic.
2748 : *
2749 : * If everything we touch in the agi processing loop is already in memory, this
2750 : * loop can hold the cpu for a long time. It runs without lock contention,
2751 : * memory allocation contention, the need wait for IO, etc, and so will run
2752 : * until we either run out of inodes to process, run low on memory or we run out
2753 : * of log space.
2754 : *
2755 : * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2756 : * and can prevent other filesystem work (such as CIL pushes) from running. This
2757 : * can lead to deadlocks if the recovery process runs out of log reservation
2758 : * space. Hence we need to yield the CPU when there is other kernel work
2759 : * scheduled on this CPU to ensure other scheduled work can run without undue
2760 : * latency.
2761 : */
2762 : static void
2763 45106 : xlog_recover_iunlink_ag(
2764 : struct xfs_perag *pag)
2765 : {
2766 45106 : struct xfs_agi *agi;
2767 45106 : struct xfs_buf *agibp;
2768 45106 : int bucket;
2769 45106 : int error;
2770 :
2771 45106 : error = xfs_read_agi(pag, NULL, &agibp);
2772 45106 : if (error) {
2773 : /*
2774 : * AGI is b0rked. Don't process it.
2775 : *
2776 : * We should probably mark the filesystem as corrupt after we've
2777 : * recovered all the ag's we can....
2778 : */
2779 12 : return;
2780 : }
2781 :
2782 : /*
2783 : * Unlock the buffer so that it can be acquired in the normal course of
2784 : * the transaction to truncate and free each inode. Because we are not
2785 : * racing with anyone else here for the AGI buffer, we don't even need
2786 : * to hold it locked to read the initial unlinked bucket entries out of
2787 : * the buffer. We keep buffer reference though, so that it stays pinned
2788 : * in memory while we need the buffer.
2789 : */
2790 45094 : agi = agibp->b_addr;
2791 45094 : xfs_buf_unlock(agibp);
2792 :
2793 2976204 : for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
2794 2886016 : error = xlog_recover_iunlink_bucket(pag, agi, bucket);
2795 2886016 : if (error) {
2796 : /*
2797 : * Bucket is unrecoverable, so only a repair scan can
2798 : * free the remaining unlinked inodes. Just empty the
2799 : * bucket and remaining inodes on it unreferenced and
2800 : * unfreeable.
2801 : */
2802 4 : xlog_recover_clear_agi_bucket(pag, bucket);
2803 : }
2804 : }
2805 :
2806 45094 : xfs_buf_rele(agibp);
2807 : }
2808 :
2809 : static void
2810 11281 : xlog_recover_process_iunlinks(
2811 : struct xlog *log)
2812 : {
2813 11281 : struct xfs_perag *pag;
2814 11281 : xfs_agnumber_t agno;
2815 :
2816 56387 : for_each_perag(log->l_mp, agno, pag)
2817 45106 : xlog_recover_iunlink_ag(pag);
2818 11281 : }
2819 :
2820 : STATIC void
2821 2609736 : xlog_unpack_data(
2822 : struct xlog_rec_header *rhead,
2823 : char *dp,
2824 : struct xlog *log)
2825 : {
2826 2609736 : int i, j, k;
2827 :
2828 143985740 : for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
2829 141376004 : i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2830 141376004 : *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
2831 141376004 : dp += BBSIZE;
2832 : }
2833 :
2834 2609736 : if (xfs_has_logv2(log->l_mp)) {
2835 : xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2836 2915952 : for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
2837 306216 : j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2838 306216 : k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2839 306216 : *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
2840 306216 : dp += BBSIZE;
2841 : }
2842 : }
2843 2609736 : }
2844 :
2845 : /*
2846 : * CRC check, unpack and process a log record.
2847 : */
2848 : STATIC int
2849 3985667 : xlog_recover_process(
2850 : struct xlog *log,
2851 : struct hlist_head rhash[],
2852 : struct xlog_rec_header *rhead,
2853 : char *dp,
2854 : int pass,
2855 : struct list_head *buffer_list)
2856 : {
2857 3985667 : __le32 old_crc = rhead->h_crc;
2858 3985667 : __le32 crc;
2859 :
2860 3985667 : crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
2861 :
2862 : /*
2863 : * Nothing else to do if this is a CRC verification pass. Just return
2864 : * if this a record with a non-zero crc. Unfortunately, mkfs always
2865 : * sets old_crc to 0 so we must consider this valid even on v5 supers.
2866 : * Otherwise, return EFSBADCRC on failure so the callers up the stack
2867 : * know precisely what failed.
2868 : */
2869 3985667 : if (pass == XLOG_RECOVER_CRCPASS) {
2870 1375931 : if (old_crc && crc != old_crc)
2871 : return -EFSBADCRC;
2872 1375920 : return 0;
2873 : }
2874 :
2875 : /*
2876 : * We're in the normal recovery path. Issue a warning if and only if the
2877 : * CRC in the header is non-zero. This is an advisory warning and the
2878 : * zero CRC check prevents warnings from being emitted when upgrading
2879 : * the kernel from one that does not add CRCs by default.
2880 : */
2881 2609736 : if (crc != old_crc) {
2882 0 : if (old_crc || xfs_has_crc(log->l_mp)) {
2883 0 : xfs_alert(log->l_mp,
2884 : "log record CRC mismatch: found 0x%x, expected 0x%x.",
2885 : le32_to_cpu(old_crc),
2886 : le32_to_cpu(crc));
2887 0 : xfs_hex_dump(dp, 32);
2888 : }
2889 :
2890 : /*
2891 : * If the filesystem is CRC enabled, this mismatch becomes a
2892 : * fatal log corruption failure.
2893 : */
2894 0 : if (xfs_has_crc(log->l_mp)) {
2895 0 : XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2896 0 : return -EFSCORRUPTED;
2897 : }
2898 : }
2899 :
2900 2609736 : xlog_unpack_data(rhead, dp, log);
2901 :
2902 2609736 : return xlog_recover_process_data(log, rhash, rhead, dp, pass,
2903 : buffer_list);
2904 : }
2905 :
/*
 * Sanity check a log record header read from disk.
 *
 * @log:	log being recovered
 * @rhead:	record header to validate
 * @blkno:	block number to range check against the log size
 * @bufsize:	upper bound accepted for the record's h_len
 *
 * Checks the magic number, the version bits, that h_len is positive and no
 * larger than @bufsize, and that @blkno does not exceed either the log size in
 * basic blocks or INT_MAX.
 *
 * Returns 0 if the header passes every check, -EFSCORRUPTED otherwise.
 */
STATIC int
xlog_valid_rec_header(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	xfs_daddr_t		blkno,
	int			bufsize)
{
	int			hlen;

	if (XFS_IS_CORRUPT(log->l_mp,
			   rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
		return -EFSCORRUPTED;
	/* The version must be non-zero and contain only known bits. */
	if (XFS_IS_CORRUPT(log->l_mp,
			   (!rhead->h_version ||
			   (be32_to_cpu(rhead->h_version) &
			    (~XLOG_VERSION_OKBITS))))) {
		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
			__func__, be32_to_cpu(rhead->h_version));
		return -EFSCORRUPTED;
	}

	/*
	 * LR body must have data (or it wouldn't have been written)
	 * and h_len must not be greater than LR buffer size.
	 */
	hlen = be32_to_cpu(rhead->h_len);
	if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
		return -EFSCORRUPTED;

	/*
	 * NOTE(review): a blkno equal to l_logBBsize is accepted by this
	 * check - confirm that is intended.
	 */
	if (XFS_IS_CORRUPT(log->l_mp,
			   blkno > log->l_logBBsize || blkno > INT_MAX))
		return -EFSCORRUPTED;
	return 0;
}
2940 :
/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 *
 * @log:	log being recovered
 * @head_blk:	block number of the log head; processing stops here
 * @tail_blk:	block number of the log tail; processing starts here
 * @pass:	recovery pass identifier, forwarded to xlog_recover_process()
 * @first_bad:	optional; on error receives the block number of the record
 *		header that was being processed when the error occurred
 *
 * Returns 0 on success. On failure returns the error that stopped the walk,
 * or, if the walk itself succeeded, the result of submitting the delayed
 * write buffer list.
 */
STATIC int
xlog_do_recovery_pass(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			pass,
	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no, rblk_no;
	xfs_daddr_t		rhead_blk;	/* start of record in progress */
	char			*offset;
	char			*hbp, *dbp;	/* header and data buffers */
	int			error = 0, h_size, h_len;
	int			error2 = 0;	/* delwri submission result */
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	int			i;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];
	LIST_HEAD		(buffer_list);

	ASSERT(head_blk != tail_blk);
	blk_no = rhead_blk = tail_blk;

	for (i = 0; i < XLOG_RHASH_SIZE; i++)
		INIT_HLIST_HEAD(&rhash[i]);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_has_logv2(log->l_mp)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_alloc_buffer(log, 1);
		if (!hbp)
			return -ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;

		/*
		 * xfsprogs has a bug where record length is based on lsunit but
		 * h_size (iclog size) is hardcoded to 32k. Now that we
		 * unconditionally CRC verify the unmount record, this means the
		 * log buffer can be too small for the record and cause an
		 * overrun.
		 *
		 * Detect this condition here. Use lsunit for the buffer size as
		 * long as this looks like the mkfs case. Otherwise, return an
		 * error to avoid a buffer overrun.
		 */
		h_size = be32_to_cpu(rhead->h_size);
		h_len = be32_to_cpu(rhead->h_len);
		if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
		    rhead->h_num_logops == cpu_to_be32(1)) {
			xfs_warn(log->l_mp,
		"invalid iclog size (%d bytes), using lsunit (%d bytes)",
				 h_size, log->l_mp->m_logbsize);
			h_size = log->l_mp->m_logbsize;
		}

		error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
		if (error)
			goto bread_err1;

		/*
		 * Replace the one block header buffer if the header spans a
		 * different number of blocks. A failed reallocation is caught
		 * by the common !hbp check below.
		 */
		hblks = xlog_logrec_hblks(log, rhead);
		if (hblks != 1) {
			kmem_free(hbp);
			hbp = xlog_alloc_buffer(log, hblks);
		}
	} else {
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_alloc_buffer(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}

	if (!hbp)
		return -ENOMEM;
	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
	if (!dbp) {
		kmem_free(hbp);
		return -ENOMEM;
	}

	/* rhash was already initialised by the INIT_HLIST_HEAD loop above. */
	memset(rhash, 0, sizeof(rhash));
	if (tail_blk > head_blk) {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery.
		 */
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_noalign(log, 0,
						wrapped_hblks,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			/*
			 * NOTE(review): the block number handed to the
			 * validator is blk_no only when the header was split
			 * and 0 otherwise - confirm this is intended.
			 */
			error = xlog_valid_rec_header(log, rhead,
					split_hblks ? blk_no : 0, h_size);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/*
			 * Read the log record data in multiple reads if it
			 * wraps around the end of the log. Note that if the
			 * header already wrapped, blk_no could point past the
			 * end of the log. The record data is contiguous in
			 * that case.
			 */
			if (blk_no + bblks <= log->l_logBBsize ||
			    blk_no >= log->l_logBBsize) {
				rblk_no = xlog_wrap_logbno(log, blk_no);
				error = xlog_bread(log, rblk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = dbp;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							split_bblks, dbp,
							&offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = xlog_bread_noalign(log, 0,
						bblks - split_bblks,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_recover_process(log, rhash, rhead, offset,
						     pass, &buffer_list);
			if (error)
				goto bread_err2;

			/* Record done; remember where the next one starts. */
			blk_no += bblks;
			rhead_blk = blk_no;
		}

		/* Wrap back to the start of the physical log. */
		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;
		rhead_blk = blk_no;
	}

	/* read first part of physical log */
	while (blk_no < head_blk) {
		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
		if (error)
			goto bread_err2;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
		if (error)
			goto bread_err2;

		/* blocks in data section */
		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
				   &offset);
		if (error)
			goto bread_err2;

		error = xlog_recover_process(log, rhash, rhead, offset, pass,
					     &buffer_list);
		if (error)
			goto bread_err2;

		blk_no += bblks + hblks;
		rhead_blk = blk_no;
	}

	/* Success falls through the labels below; they are shared cleanup. */
 bread_err2:
	kmem_free(dbp);
 bread_err1:
	kmem_free(hbp);

	/*
	 * Submit buffers that have been added from the last record processed,
	 * regardless of error status.
	 */
	if (!list_empty(&buffer_list))
		error2 = xfs_buf_delwri_submit(&buffer_list);

	/* rhead_blk still points at the record that failed. */
	if (error && first_bad)
		*first_bad = rhead_blk;

	/*
	 * Transactions are freed at commit time but transactions without commit
	 * records on disk are never committed. Free any that may be left in the
	 * hash table.
	 */
	for (i = 0; i < XLOG_RHASH_SIZE; i++) {
		struct hlist_node	*tmp;
		struct xlog_recover	*trans;

		hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
			xlog_recover_free_trans(trans);
	}

	/* An error from the walk takes precedence over a submission error. */
	return error ? error : error2;
}
3227 :
3228 : /*
3229 : * Do the recovery of the log. We actually do this in two phases.
3230 : * The two passes are necessary in order to implement the function
3231 : * of cancelling a record written into the log. The first pass
3232 : * determines those things which have been cancelled, and the
3233 : * second pass replays log items normally except for those which
3234 : * have been cancelled. The handling of the replay and cancellations
3235 : * takes place in the log item type specific routines.
3236 : *
3237 : * The table of items which have cancel records in the log is allocated
3238 : * and freed at this level, since only here do we know when all of
3239 : * the log recovery has been completed.
3240 : */
3241 : STATIC int
3242 11283 : xlog_do_log_recovery(
3243 : struct xlog *log,
3244 : xfs_daddr_t head_blk,
3245 : xfs_daddr_t tail_blk)
3246 : {
3247 11283 : int error;
3248 :
3249 11283 : ASSERT(head_blk != tail_blk);
3250 :
3251 : /*
3252 : * First do a pass to find all of the cancelled buf log items.
3253 : * Store them in the buf_cancel_table for use in the second pass.
3254 : */
3255 11283 : error = xlog_alloc_buf_cancel_table(log);
3256 11283 : if (error)
3257 : return error;
3258 :
3259 11283 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3260 : XLOG_RECOVER_PASS1, NULL);
3261 11283 : if (error != 0)
3262 0 : goto out_cancel;
3263 :
3264 : /*
3265 : * Then do a second pass to actually recover the items in the log.
3266 : * When it is complete free the table of buf cancel items.
3267 : */
3268 11283 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3269 : XLOG_RECOVER_PASS2, NULL);
3270 11283 : if (!error)
3271 11283 : xlog_check_buf_cancel_table(log);
3272 0 : out_cancel:
3273 11283 : xlog_free_buf_cancel_table(log);
3274 11283 : return error;
3275 : }
3276 :
3277 : /*
3278 : * Do the actual recovery
3279 : */
3280 : STATIC int
3281 11283 : xlog_do_recover(
3282 : struct xlog *log,
3283 : xfs_daddr_t head_blk,
3284 : xfs_daddr_t tail_blk)
3285 : {
3286 11283 : struct xfs_mount *mp = log->l_mp;
3287 11283 : struct xfs_buf *bp = mp->m_sb_bp;
3288 11283 : struct xfs_sb *sbp = &mp->m_sb;
3289 11283 : int error;
3290 :
3291 11283 : trace_xfs_log_recover(log, head_blk, tail_blk);
3292 :
3293 : /*
3294 : * First replay the images in the log.
3295 : */
3296 11283 : error = xlog_do_log_recovery(log, head_blk, tail_blk);
3297 11283 : if (error)
3298 : return error;
3299 :
3300 22566 : if (xlog_is_shutdown(log))
3301 : return -EIO;
3302 :
3303 : /*
3304 : * We now update the tail_lsn since much of the recovery has completed
3305 : * and there may be space available to use. If there were no extent
3306 : * or iunlinks, we can free up the entire log and set the tail_lsn to
3307 : * be the last_sync_lsn. This was set in xlog_find_tail to be the
3308 : * lsn of the last known good LR on disk. If there are extent frees
3309 : * or iunlinks they will have some entries in the AIL; so we look at
3310 : * the AIL to determine how to set the tail_lsn.
3311 : */
3312 11283 : xlog_assign_tail_lsn(mp);
3313 :
3314 : /*
3315 : * Now that we've finished replaying all buffer and inode updates,
3316 : * re-read the superblock and reverify it.
3317 : */
3318 11283 : xfs_buf_lock(bp);
3319 11283 : xfs_buf_hold(bp);
3320 11283 : error = _xfs_buf_read(bp, XBF_READ);
3321 11283 : if (error) {
3322 0 : if (!xlog_is_shutdown(log)) {
3323 0 : xfs_buf_ioerror_alert(bp, __this_address);
3324 0 : ASSERT(0);
3325 : }
3326 0 : xfs_buf_relse(bp);
3327 0 : return error;
3328 : }
3329 :
3330 : /* Convert superblock from on-disk format */
3331 11283 : xfs_sb_from_disk(sbp, bp->b_addr);
3332 11283 : xfs_buf_relse(bp);
3333 :
3334 : /* re-initialise in-core superblock and geometry structures */
3335 11283 : mp->m_features |= xfs_sb_version_to_features(sbp);
3336 11283 : xfs_reinit_percpu_counters(mp);
3337 11283 : error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
3338 : &mp->m_maxagi);
3339 11283 : if (error) {
3340 0 : xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3341 0 : return error;
3342 : }
3343 11283 : mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
3344 :
3345 : /* Normal transactions can now occur */
3346 11283 : clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
3347 : return 0;
3348 : }
3349 :
3350 : /*
3351 : * Perform recovery and re-initialize some log variables in xlog_find_tail.
3352 : *
3353 : * Return error or zero.
3354 : */
3355 : int
3356 24105 : xlog_recover(
3357 : struct xlog *log)
3358 : {
3359 24105 : xfs_daddr_t head_blk, tail_blk;
3360 24105 : int error;
3361 :
3362 : /* find the tail of the log */
3363 24105 : error = xlog_find_tail(log, &head_blk, &tail_blk);
3364 24105 : if (error)
3365 : return error;
3366 :
3367 : /*
3368 : * The superblock was read before the log was available and thus the LSN
3369 : * could not be verified. Check the superblock LSN against the current
3370 : * LSN now that it's known.
3371 : */
3372 48160 : if (xfs_has_crc(log->l_mp) &&
3373 24059 : !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3374 : return -EINVAL;
3375 :
3376 24099 : if (tail_blk != head_blk) {
3377 : /* There used to be a comment here:
3378 : *
3379 : * disallow recovery on read-only mounts. note -- mount
3380 : * checks for ENOSPC and turns it into an intelligent
3381 : * error message.
3382 : * ...but this is no longer true. Now, unless you specify
3383 : * NORECOVERY (in which case this function would never be
3384 : * called), we just go ahead and recover. We do this all
3385 : * under the vfs layer, so we can get away with it unless
3386 : * the device itself is read-only, in which case we fail.
3387 : */
3388 11285 : if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3389 : return error;
3390 : }
3391 :
3392 : /*
3393 : * Version 5 superblock log feature mask validation. We know the
3394 : * log is dirty so check if there are any unknown log features
3395 : * in what we need to recover. If there are unknown features
3396 : * (e.g. unsupported transactions, then simply reject the
3397 : * attempt at recovery before touching anything.
3398 : */
3399 11283 : if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
3400 : xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3401 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3402 0 : xfs_warn(log->l_mp,
3403 : "Superblock has unknown incompatible log features (0x%x) enabled.",
3404 : (log->l_mp->m_sb.sb_features_log_incompat &
3405 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3406 0 : xfs_warn(log->l_mp,
3407 : "The log can not be fully and/or safely recovered by this kernel.");
3408 0 : xfs_warn(log->l_mp,
3409 : "Please recover the log on a kernel that supports the unknown features.");
3410 0 : return -EINVAL;
3411 : }
3412 :
3413 : /*
3414 : * Delay log recovery if the debug hook is set. This is debug
3415 : * instrumentation to coordinate simulation of I/O failures with
3416 : * log recovery.
3417 : */
3418 11283 : if (xfs_globals.log_recovery_delay) {
3419 4 : xfs_notice(log->l_mp,
3420 : "Delaying log recovery for %d seconds.",
3421 : xfs_globals.log_recovery_delay);
3422 4 : msleep(xfs_globals.log_recovery_delay * 1000);
3423 : }
3424 :
3425 11283 : xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3426 : log->l_mp->m_logname ? log->l_mp->m_logname
3427 : : "internal");
3428 :
3429 11283 : error = xlog_do_recover(log, head_blk, tail_blk);
3430 11283 : set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
3431 : }
3432 : return error;
3433 : }
3434 :
3435 : /*
3436 : * In the first part of recovery we replay inodes and buffers and build up the
3437 : * list of intents which need to be processed. Here we process the intents and
3438 : * clean up the on disk unlinked inode lists. This is separated from the first
3439 : * part of recovery so that the root and real-time bitmap inodes can be read in
3440 : * from disk in between the two stages. This is necessary so that we can free
3441 : * space in the real-time portion of the file system.
3442 : */
3443 : int
3444 11283 : xlog_recover_finish(
3445 : struct xlog *log)
3446 : {
3447 11283 : int error;
3448 :
3449 11283 : error = xlog_recover_process_intents(log);
3450 11283 : if (error) {
3451 : /*
3452 : * Cancel all the unprocessed intent items now so that we don't
3453 : * leave them pinned in the AIL. This can cause the AIL to
3454 : * livelock on the pinned item if anyone tries to push the AIL
3455 : * (inode reclaim does this) before we get around to
3456 : * xfs_log_mount_cancel.
3457 : */
3458 2 : xlog_recover_cancel_intents(log);
3459 2 : xfs_alert(log->l_mp, "Failed to recover intents");
3460 2 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3461 2 : return error;
3462 : }
3463 :
3464 : /*
3465 : * Sync the log to get all the intents out of the AIL. This isn't
3466 : * absolutely necessary, but it helps in case the unlink transactions
3467 : * would have problems pushing the intents out of the way.
3468 : */
3469 11281 : xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3470 :
3471 : /*
3472 : * Now that we've recovered the log and all the intents, we can clear
3473 : * the log incompat feature bits in the superblock because there's no
3474 : * longer anything to protect. We rely on the AIL push to write out the
3475 : * updated superblock after everything else.
3476 : */
3477 11281 : if (xfs_clear_incompat_log_features(log->l_mp,
3478 : XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
3479 9132 : error = xfs_sync_sb(log->l_mp, false);
3480 9132 : if (error < 0) {
3481 0 : xfs_alert(log->l_mp,
3482 : "Failed to clear log incompat features on recovery");
3483 0 : return error;
3484 : }
3485 : }
3486 :
3487 11281 : xlog_recover_process_iunlinks(log);
3488 :
3489 : /*
3490 : * Recover any CoW staging blocks that are still referenced by the
3491 : * ondisk refcount metadata. During mount there cannot be any live
3492 : * staging extents as we have not permitted any user modifications.
3493 : * Therefore, it is safe to free them all right now, even on a
3494 : * read-only mount.
3495 : */
3496 11281 : error = xfs_reflink_recover_cow(log->l_mp);
3497 11281 : if (error) {
3498 8 : xfs_alert(log->l_mp,
3499 : "Failed to recover leftover CoW staging extents, err %d.",
3500 : error);
3501 : /*
3502 : * If we get an error here, make sure the log is shut down
3503 : * but return zero so that any log items committed since the
3504 : * end of intents processing can be pushed through the CIL
3505 : * and AIL.
3506 : */
3507 8 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3508 : }
3509 :
3510 : return 0;
3511 : }
3512 :
/*
 * Cancel the recovered intent items, but only if the log was marked as
 * needing recovery.
 */
void
xlog_recover_cancel(
	struct xlog	*log)
{
	if (!xlog_recovery_needed(log))
		return;

	xlog_recover_cancel_intents(log);
}
3520 :
|