Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_bit.h"
13 : #include "xfs_sb.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_log.h"
19 : #include "xfs_log_priv.h"
20 : #include "xfs_log_recover.h"
21 : #include "xfs_trans_priv.h"
22 : #include "xfs_alloc.h"
23 : #include "xfs_ialloc.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_icache.h"
26 : #include "xfs_error.h"
27 : #include "xfs_buf_item.h"
28 : #include "xfs_ag.h"
29 : #include "xfs_quota.h"
30 : #include "xfs_reflink.h"
31 :
/*
 * Midpoint of two block numbers, rounded down. Both arguments and the whole
 * expansion are parenthesized so that callers may pass arbitrary expressions
 * without operator-precedence surprises. Each argument is evaluated once.
 */
#define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
33 :
34 : STATIC int
35 : xlog_find_zeroed(
36 : struct xlog *,
37 : xfs_daddr_t *);
38 : STATIC int
39 : xlog_clear_stale_blocks(
40 : struct xlog *,
41 : xfs_lsn_t);
42 : STATIC int
43 : xlog_do_recovery_pass(
44 : struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
45 :
46 : /*
47 : * Sector aligned buffer routines for buffer create/read/write/access
48 : */
49 :
50 : /*
51 : * Verify the log-relative block number and length in basic blocks are valid for
52 : * an operation involving the given XFS log buffer. Returns true if the fields
53 : * are valid, false otherwise.
54 : */
55 : static inline bool
56 : xlog_verify_bno(
57 : struct xlog *log,
58 : xfs_daddr_t blk_no,
59 : int bbcount)
60 : {
61 11953044 : if (blk_no < 0 || blk_no >= log->l_logBBsize)
62 : return false;
63 12183728 : if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
64 0 : return false;
65 : return true;
66 : }
67 :
68 : /*
69 : * Allocate a buffer to hold log data. The buffer needs to be able to map to
70 : * a range of nbblks basic blocks at any valid offset within the log.
71 : */
72 : static char *
73 230684 : xlog_alloc_buffer(
74 : struct xlog *log,
75 : int nbblks)
76 : {
77 : /*
78 : * Pass log block 0 since we don't have an addr yet, buffer will be
79 : * verified on read.
80 : */
81 461368 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
82 0 : xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
83 : nbblks);
84 0 : return NULL;
85 : }
86 :
87 : /*
88 : * We do log I/O in units of log sectors (a power-of-2 multiple of the
89 : * basic block size), so we round up the requested size to accommodate
90 : * the basic blocks required for complete log sectors.
91 : *
92 : * In addition, the buffer may be used for a non-sector-aligned block
93 : * offset, in which case an I/O of the requested size could extend
94 : * beyond the end of the buffer. If the requested size is only 1 basic
95 : * block it will never straddle a sector boundary, so this won't be an
96 : * issue. Nor will this be a problem if the log I/O is done in basic
97 : * blocks (sector size 1). But otherwise we extend the buffer by one
98 : * extra log sector to ensure there's space to accommodate this
99 : * possibility.
100 : */
101 230684 : if (nbblks > 1 && log->l_sectBBsize > 1)
102 108837 : nbblks += log->l_sectBBsize;
103 230684 : nbblks = round_up(nbblks, log->l_sectBBsize);
104 230684 : return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
105 : }
106 :
107 : /*
108 : * Return the address of the start of the given block number's data
109 : * in a log buffer. The buffer covers a log sector-aligned region.
110 : */
111 : static inline unsigned int
112 : xlog_align(
113 : struct xlog *log,
114 : xfs_daddr_t blk_no)
115 : {
116 11951820 : return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
117 : }
118 :
119 : static int
120 11953044 : xlog_do_io(
121 : struct xlog *log,
122 : xfs_daddr_t blk_no,
123 : unsigned int nbblks,
124 : char *data,
125 : enum req_op op)
126 : {
127 11953044 : int error;
128 :
129 23906088 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
130 0 : xfs_warn(log->l_mp,
131 : "Invalid log block/length (0x%llx, 0x%x) for buffer",
132 : blk_no, nbblks);
133 0 : return -EFSCORRUPTED;
134 : }
135 :
136 11953044 : blk_no = round_down(blk_no, log->l_sectBBsize);
137 11953044 : nbblks = round_up(nbblks, log->l_sectBBsize);
138 11953044 : ASSERT(nbblks > 0);
139 :
140 11953044 : error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
141 : BBTOB(nbblks), data, op);
142 11953044 : if (error && !xlog_is_shutdown(log)) {
143 0 : xfs_alert(log->l_mp,
144 : "log recovery %s I/O error at daddr 0x%llx len %d error %d",
145 : op == REQ_OP_WRITE ? "write" : "read",
146 : blk_no, nbblks, error);
147 : }
148 : return error;
149 : }
150 :
151 : STATIC int
152 1224 : xlog_bread_noalign(
153 : struct xlog *log,
154 : xfs_daddr_t blk_no,
155 : int nbblks,
156 : char *data)
157 : {
158 1224 : return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
159 : }
160 :
161 : STATIC int
162 11860158 : xlog_bread(
163 : struct xlog *log,
164 : xfs_daddr_t blk_no,
165 : int nbblks,
166 : char *data,
167 : char **offset)
168 : {
169 11860158 : int error;
170 :
171 11860158 : error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
172 11860158 : if (!error)
173 11860158 : *offset = data + xlog_align(log, blk_no);
174 11860158 : return error;
175 : }
176 :
177 : STATIC int
178 91662 : xlog_bwrite(
179 : struct xlog *log,
180 : xfs_daddr_t blk_no,
181 : int nbblks,
182 : char *data)
183 : {
184 91662 : return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
185 : }
186 :
#ifdef DEBUG
/*
 * Debug-only helper: print the superblock uuid and expected log format next
 * to the uuid and format found in the given log record header.
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	/* What the mounted filesystem expects ... */
	xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
		  __func__, &mp->m_sb.sb_uuid, XLOG_FMT);

	/* ... versus what the log record header actually carries. */
	xfs_debug(mp, " log : uuid = %pU, fmt = %d",
		  &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
/* Non-DEBUG builds: the dump expands to nothing. */
#define xlog_header_check_dump(mp, head)
#endif
204 :
205 : /*
206 : * check log record header for recovery
207 : */
208 : STATIC int
209 2295766 : xlog_header_check_recover(
210 : xfs_mount_t *mp,
211 : xlog_rec_header_t *head)
212 : {
213 2295766 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
214 :
215 : /*
216 : * IRIX doesn't write the h_fmt field and leaves it zeroed
217 : * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
218 : * a dirty log created in IRIX.
219 : */
220 2295766 : if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
221 0 : xfs_warn(mp,
222 : "dirty log written in incompatible format - can't recover");
223 0 : xlog_header_check_dump(mp, head);
224 0 : return -EFSCORRUPTED;
225 : }
226 2295766 : if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
227 : &head->h_fs_uuid))) {
228 0 : xfs_warn(mp,
229 : "dirty log entry has mismatched uuid - can't recover");
230 0 : xlog_header_check_dump(mp, head);
231 0 : return -EFSCORRUPTED;
232 : }
233 : return 0;
234 : }
235 :
236 : /*
237 : * read the head block of the log and check the header
238 : */
239 : STATIC int
240 22477 : xlog_header_check_mount(
241 : xfs_mount_t *mp,
242 : xlog_rec_header_t *head)
243 : {
244 22477 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
245 :
246 22477 : if (uuid_is_null(&head->h_fs_uuid)) {
247 : /*
248 : * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
249 : * h_fs_uuid is null, we assume this log was last mounted
250 : * by IRIX and continue.
251 : */
252 0 : xfs_warn(mp, "null uuid in log - IRIX style log");
253 22477 : } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
254 : &head->h_fs_uuid))) {
255 0 : xfs_warn(mp, "log has mismatched uuid - can't recover");
256 0 : xlog_header_check_dump(mp, head);
257 0 : return -EFSCORRUPTED;
258 : }
259 : return 0;
260 : }
261 :
262 : /*
263 : * This routine finds (to an approximation) the first block in the physical
264 : * log which contains the given cycle. It uses a binary search algorithm.
265 : * Note that the algorithm can not be perfect because the disk will not
266 : * necessarily be perfect.
267 : */
268 : STATIC int
269 22476 : xlog_find_cycle_start(
270 : struct xlog *log,
271 : char *buffer,
272 : xfs_daddr_t first_blk,
273 : xfs_daddr_t *last_blk,
274 : uint cycle)
275 : {
276 22476 : char *offset;
277 22476 : xfs_daddr_t mid_blk;
278 22476 : xfs_daddr_t end_blk;
279 22476 : uint mid_cycle;
280 22476 : int error;
281 :
282 22476 : end_blk = *last_blk;
283 22476 : mid_blk = BLK_AVG(first_blk, end_blk);
284 403175 : while (mid_blk != first_blk && mid_blk != end_blk) {
285 380699 : error = xlog_bread(log, mid_blk, 1, buffer, &offset);
286 380699 : if (error)
287 0 : return error;
288 380699 : mid_cycle = xlog_get_cycle(offset);
289 380699 : if (mid_cycle == cycle)
290 : end_blk = mid_blk; /* last_half_cycle == mid_cycle */
291 : else
292 120353 : first_blk = mid_blk; /* first_half_cycle == mid_cycle */
293 380699 : mid_blk = BLK_AVG(first_blk, end_blk);
294 : }
295 22476 : ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
296 : (mid_blk == end_blk && mid_blk-1 == first_blk));
297 :
298 22476 : *last_blk = end_blk;
299 :
300 22476 : return 0;
301 : }
302 :
303 : /*
304 : * Check that a range of blocks does not contain stop_on_cycle_no.
305 : * Fill in *new_blk with the block offset where such a block is
306 : * found, or with -1 (an invalid block number) if there is no such
307 : * block in the range. The scan needs to occur from front to back
308 : * and the pointer into the region must be updated since a later
309 : * routine will need to perform another test.
310 : */
311 : STATIC int
312 23052 : xlog_find_verify_cycle(
313 : struct xlog *log,
314 : xfs_daddr_t start_blk,
315 : int nbblks,
316 : uint stop_on_cycle_no,
317 : xfs_daddr_t *new_blk)
318 : {
319 23052 : xfs_daddr_t i, j;
320 23052 : uint cycle;
321 23052 : char *buffer;
322 23052 : xfs_daddr_t bufblks;
323 23052 : char *buf = NULL;
324 23052 : int error = 0;
325 :
326 : /*
327 : * Greedily allocate a buffer big enough to handle the full
328 : * range of basic blocks we'll be examining. If that fails,
329 : * try a smaller size. We need to be able to read at least
330 : * a log sector, or we're out of luck.
331 : */
332 23052 : bufblks = 1 << ffs(nbblks);
333 23052 : while (bufblks > log->l_logBBsize)
334 0 : bufblks >>= 1;
335 23052 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
336 0 : bufblks >>= 1;
337 0 : if (bufblks < log->l_sectBBsize)
338 : return -ENOMEM;
339 : }
340 :
341 196999 : for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
342 173982 : int bcount;
343 :
344 173982 : bcount = min(bufblks, (start_blk + nbblks - i));
345 :
346 173982 : error = xlog_bread(log, i, bcount, buffer, &buf);
347 173982 : if (error)
348 0 : goto out;
349 :
350 65076673 : for (j = 0; j < bcount; j++) {
351 64902726 : cycle = xlog_get_cycle(buf);
352 64902726 : if (cycle == stop_on_cycle_no) {
353 35 : *new_blk = i+j;
354 35 : goto out;
355 : }
356 :
357 64902691 : buf += BBSIZE;
358 : }
359 : }
360 :
361 23017 : *new_blk = -1;
362 :
363 23052 : out:
364 23052 : kmem_free(buffer);
365 23052 : return error;
366 : }
367 :
368 : static inline int
369 86145 : xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
370 : {
371 86145 : if (xfs_has_logv2(log->l_mp)) {
372 86133 : int h_size = be32_to_cpu(rh->h_size);
373 :
374 86133 : if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
375 : h_size > XLOG_HEADER_CYCLE_SIZE)
376 444 : return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
377 : }
378 : return 1;
379 : }
380 :
381 : /*
382 : * Potentially backup over partial log record write.
383 : *
384 : * In the typical case, last_blk is the number of the block directly after
385 : * a good log record. Therefore, we subtract one to get the block number
386 : * of the last block in the given buffer. extra_bblks contains the number
387 : * of blocks we would have read on a previous read. This happens when the
388 : * last log record is split over the end of the physical log.
389 : *
390 : * extra_bblks is the number of blocks potentially verified on a previous
391 : * call to this routine.
392 : */
393 : STATIC int
394 22483 : xlog_find_verify_log_record(
395 : struct xlog *log,
396 : xfs_daddr_t start_blk,
397 : xfs_daddr_t *last_blk,
398 : int extra_bblks)
399 : {
400 22483 : xfs_daddr_t i;
401 22483 : char *buffer;
402 22483 : char *offset = NULL;
403 22483 : xlog_rec_header_t *head = NULL;
404 22483 : int error = 0;
405 22483 : int smallmem = 0;
406 22483 : int num_blks = *last_blk - start_blk;
407 22483 : int xhdrs;
408 :
409 22483 : ASSERT(start_blk != 0 || *last_blk != start_blk);
410 :
411 22483 : buffer = xlog_alloc_buffer(log, num_blks);
412 22483 : if (!buffer) {
413 0 : buffer = xlog_alloc_buffer(log, 1);
414 0 : if (!buffer)
415 : return -ENOMEM;
416 : smallmem = 1;
417 : } else {
418 22483 : error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
419 22483 : if (error)
420 0 : goto out;
421 22483 : offset += ((num_blks - 1) << BBSHIFT);
422 : }
423 :
424 535636 : for (i = (*last_blk) - 1; i >= 0; i--) {
425 535634 : if (i < start_blk) {
426 : /* valid log record not found */
427 4 : xfs_warn(log->l_mp,
428 : "Log inconsistent (didn't find previous header)");
429 4 : ASSERT(0);
430 4 : error = -EFSCORRUPTED;
431 4 : goto out;
432 : }
433 :
434 535630 : if (smallmem) {
435 0 : error = xlog_bread(log, i, 1, buffer, &offset);
436 0 : if (error)
437 0 : goto out;
438 : }
439 :
440 535630 : head = (xlog_rec_header_t *)offset;
441 :
442 535630 : if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
443 : break;
444 :
445 513153 : if (!smallmem)
446 513153 : offset -= BBSIZE;
447 : }
448 :
449 : /*
450 : * We hit the beginning of the physical log & still no header. Return
451 : * to caller. If caller can handle a return of -1, then this routine
452 : * will be called again for the end of the physical log.
453 : */
454 22479 : if (i == -1) {
455 2 : error = 1;
456 2 : goto out;
457 : }
458 :
459 : /*
460 : * We have the final block of the good log (the first block
461 : * of the log record _before_ the head. So we check the uuid.
462 : */
463 22477 : if ((error = xlog_header_check_mount(log->l_mp, head)))
464 0 : goto out;
465 :
466 : /*
467 : * We may have found a log record header before we expected one.
468 : * last_blk will be the 1st block # with a given cycle #. We may end
469 : * up reading an entire log record. In this case, we don't want to
470 : * reset last_blk. Only when last_blk points in the middle of a log
471 : * record do we update last_blk.
472 : */
473 22477 : xhdrs = xlog_logrec_hblks(log, head);
474 :
475 44954 : if (*last_blk - i + extra_bblks !=
476 22477 : BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
477 36 : *last_blk = i;
478 :
479 22441 : out:
480 22483 : kmem_free(buffer);
481 22483 : return error;
482 : }
483 :
484 : /*
485 : * Head is defined to be the point of the log where the next log write
486 : * could go. This means that incomplete LR writes at the end are
487 : * eliminated when calculating the head. We aren't guaranteed that previous
488 : * LR have complete transactions. We only know that a cycle number of
489 : * current cycle number -1 won't be present in the log if we start writing
490 : * from our current block number.
491 : *
492 : * last_blk contains the block number of the first block with a given
493 : * cycle number.
494 : *
495 : * Return: zero if normal, non-zero if error.
496 : */
497 : STATIC int
498 22481 : xlog_find_head(
499 : struct xlog *log,
500 : xfs_daddr_t *return_head_blk)
501 : {
502 22481 : char *buffer;
503 22481 : char *offset;
504 22481 : xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
505 22481 : int num_scan_bblks;
506 22481 : uint first_half_cycle, last_half_cycle;
507 22481 : uint stop_on_cycle;
508 22481 : int error, log_bbnum = log->l_logBBsize;
509 :
510 : /* Is the end of the log device zeroed? */
511 22481 : error = xlog_find_zeroed(log, &first_blk);
512 22481 : if (error < 0) {
513 0 : xfs_warn(log->l_mp, "empty log check failed");
514 0 : return error;
515 : }
516 22481 : if (error == 1) {
517 8299 : *return_head_blk = first_blk;
518 :
519 : /* Is the whole lot zeroed? */
520 8299 : if (!first_blk) {
521 : /* Linux XFS shouldn't generate totally zeroed logs -
522 : * mkfs etc write a dummy unmount record to a fresh
523 : * log so we can store the uuid in there
524 : */
525 0 : xfs_warn(log->l_mp, "totally zeroed log");
526 : }
527 :
528 8299 : return 0;
529 : }
530 :
531 14182 : first_blk = 0; /* get cycle # of 1st block */
532 14182 : buffer = xlog_alloc_buffer(log, 1);
533 14182 : if (!buffer)
534 : return -ENOMEM;
535 :
536 14182 : error = xlog_bread(log, 0, 1, buffer, &offset);
537 14182 : if (error)
538 0 : goto out_free_buffer;
539 :
540 14182 : first_half_cycle = xlog_get_cycle(offset);
541 :
542 14182 : last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
543 14182 : error = xlog_bread(log, last_blk, 1, buffer, &offset);
544 14182 : if (error)
545 0 : goto out_free_buffer;
546 :
547 14182 : last_half_cycle = xlog_get_cycle(offset);
548 14182 : ASSERT(last_half_cycle != 0);
549 :
550 : /*
551 : * If the 1st half cycle number is equal to the last half cycle number,
552 : * then the entire log is stamped with the same cycle number. In this
553 : * case, head_blk can't be set to zero (which makes sense). The below
554 : * math doesn't work out properly with head_blk equal to zero. Instead,
555 : * we set it to log_bbnum which is an invalid block number, but this
556 : * value makes the math correct. If head_blk doesn't changed through
557 : * all the tests below, *head_blk is set to zero at the very end rather
558 : * than log_bbnum. In a sense, log_bbnum and zero are the same block
559 : * in a circular file.
560 : */
561 14182 : if (first_half_cycle == last_half_cycle) {
562 : /*
563 : * In this case we believe that the entire log should have
564 : * cycle number last_half_cycle. We need to scan backwards
565 : * from the end verifying that there are no holes still
566 : * containing last_half_cycle - 1. If we find such a hole,
567 : * then the start of that hole will be the new head. The
568 : * simple case looks like
569 : * x | x ... | x - 1 | x
570 : * Another case that fits this picture would be
571 : * x | x + 1 | x ... | x
572 : * In this case the head really is somewhere at the end of the
573 : * log, as one of the latest writes at the beginning was
574 : * incomplete.
575 : * One more case is
576 : * x | x + 1 | x ... | x - 1 | x
577 : * This is really the combination of the above two cases, and
578 : * the head has to end up at the start of the x-1 hole at the
579 : * end of the log.
580 : *
581 : * In the 256k log case, we will read from the beginning to the
582 : * end of the log and search for cycle numbers equal to x-1.
583 : * We don't worry about the x+1 blocks that we encounter,
584 : * because we know that they cannot be the head since the log
585 : * started with x.
586 : */
587 5 : head_blk = log_bbnum;
588 5 : stop_on_cycle = last_half_cycle - 1;
589 : } else {
590 : /*
591 : * In this case we want to find the first block with cycle
592 : * number matching last_half_cycle. We expect the log to be
593 : * some variation on
594 : * x + 1 ... | x ... | x
595 : * The first block with cycle number x (last_half_cycle) will
596 : * be where the new head belongs. First we do a binary search
597 : * for the first occurrence of last_half_cycle. The binary
598 : * search may not be totally accurate, so then we scan back
599 : * from there looking for occurrences of last_half_cycle before
600 : * us. If that backwards scan wraps around the beginning of
601 : * the log, then we look for occurrences of last_half_cycle - 1
602 : * at the end of the log. The cases we're looking for look
603 : * like
604 : * v binary search stopped here
605 : * x + 1 ... | x | x + 1 | x ... | x
606 : * ^ but we want to locate this spot
607 : * or
608 : * <---------> less than scan distance
609 : * x + 1 ... | x ... | x - 1 | x
610 : * ^ we want to locate this spot
611 : */
612 14177 : stop_on_cycle = last_half_cycle;
613 14177 : error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
614 : last_half_cycle);
615 14177 : if (error)
616 0 : goto out_free_buffer;
617 : }
618 :
619 : /*
620 : * Now validate the answer. Scan back some number of maximum possible
621 : * blocks and make sure each one has the expected cycle number. The
622 : * maximum is determined by the total possible amount of buffering
623 : * in the in-core log. The following number can be made tighter if
624 : * we actually look at the block size of the filesystem.
625 : */
626 14182 : num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
627 14182 : if (head_blk >= num_scan_bblks) {
628 : /*
629 : * We are guaranteed that the entire check can be performed
630 : * in one buffer.
631 : */
632 13611 : start_blk = head_blk - num_scan_bblks;
633 13611 : if ((error = xlog_find_verify_cycle(log,
634 : start_blk, num_scan_bblks,
635 : stop_on_cycle, &new_blk)))
636 0 : goto out_free_buffer;
637 13611 : if (new_blk != -1)
638 28 : head_blk = new_blk;
639 : } else { /* need to read 2 parts of log */
640 : /*
641 : * We are going to scan backwards in the log in two parts.
642 : * First we scan the physical end of the log. In this part
643 : * of the log, we are looking for blocks with cycle number
644 : * last_half_cycle - 1.
645 : * If we find one, then we know that the log starts there, as
646 : * we've found a hole that didn't get written in going around
647 : * the end of the physical log. The simple case for this is
648 : * x + 1 ... | x ... | x - 1 | x
649 : * <---------> less than scan distance
650 : * If all of the blocks at the end of the log have cycle number
651 : * last_half_cycle, then we check the blocks at the start of
652 : * the log looking for occurrences of last_half_cycle. If we
653 : * find one, then our current estimate for the location of the
654 : * first occurrence of last_half_cycle is wrong and we move
655 : * back to the hole we've found. This case looks like
656 : * x + 1 ... | x | x + 1 | x ...
657 : * ^ binary search stopped here
658 : * Another case we need to handle that only occurs in 256k
659 : * logs is
660 : * x + 1 ... | x ... | x+1 | x ...
661 : * ^ binary search stops here
662 : * In a 256k log, the scan at the end of the log will see the
663 : * x + 1 blocks. We need to skip past those since that is
664 : * certainly not the head of the log. By searching for
665 : * last_half_cycle-1 we accomplish that.
666 : */
667 571 : ASSERT(head_blk <= INT_MAX &&
668 : (xfs_daddr_t) num_scan_bblks >= head_blk);
669 571 : start_blk = log_bbnum - (num_scan_bblks - head_blk);
670 571 : if ((error = xlog_find_verify_cycle(log, start_blk,
671 571 : num_scan_bblks - (int)head_blk,
672 : (stop_on_cycle - 1), &new_blk)))
673 0 : goto out_free_buffer;
674 571 : if (new_blk != -1) {
675 0 : head_blk = new_blk;
676 0 : goto validate_head;
677 : }
678 :
679 : /*
680 : * Scan beginning of log now. The last part of the physical
681 : * log is good. This scan needs to verify that it doesn't find
682 : * the last_half_cycle.
683 : */
684 571 : start_blk = 0;
685 571 : ASSERT(head_blk <= INT_MAX);
686 571 : if ((error = xlog_find_verify_cycle(log,
687 : start_blk, (int)head_blk,
688 : stop_on_cycle, &new_blk)))
689 0 : goto out_free_buffer;
690 571 : if (new_blk != -1)
691 2 : head_blk = new_blk;
692 : }
693 :
694 569 : validate_head:
695 : /*
696 : * Now we need to make sure head_blk is not pointing to a block in
697 : * the middle of a log record.
698 : */
699 14182 : num_scan_bblks = XLOG_REC_SHIFT(log);
700 14182 : if (head_blk >= num_scan_bblks) {
701 14104 : start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
702 :
703 : /* start ptr at last block ptr before head_blk */
704 14104 : error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
705 14104 : if (error == 1)
706 : error = -EIO;
707 14104 : if (error)
708 4 : goto out_free_buffer;
709 : } else {
710 78 : start_blk = 0;
711 78 : ASSERT(head_blk <= INT_MAX);
712 78 : error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
713 78 : if (error < 0)
714 0 : goto out_free_buffer;
715 78 : if (error == 1) {
716 : /* We hit the beginning of the log during our search */
717 2 : start_blk = log_bbnum - (num_scan_bblks - head_blk);
718 2 : new_blk = log_bbnum;
719 2 : ASSERT(start_blk <= INT_MAX &&
720 : (xfs_daddr_t) log_bbnum-start_blk >= 0);
721 2 : ASSERT(head_blk <= INT_MAX);
722 2 : error = xlog_find_verify_log_record(log, start_blk,
723 : &new_blk, (int)head_blk);
724 2 : if (error == 1)
725 : error = -EIO;
726 2 : if (error)
727 0 : goto out_free_buffer;
728 2 : if (new_blk != log_bbnum)
729 0 : head_blk = new_blk;
730 76 : } else if (error)
731 0 : goto out_free_buffer;
732 : }
733 :
734 14178 : kmem_free(buffer);
735 14178 : if (head_blk == log_bbnum)
736 1 : *return_head_blk = 0;
737 : else
738 14177 : *return_head_blk = head_blk;
739 : /*
740 : * When returning here, we have a good block number. Bad block
741 : * means that during a previous crash, we didn't have a clean break
742 : * from cycle number N to cycle number N-1. In this case, we need
743 : * to find the first block with cycle number N-1.
744 : */
745 : return 0;
746 :
747 4 : out_free_buffer:
748 4 : kmem_free(buffer);
749 4 : if (error)
750 4 : xfs_warn(log->l_mp, "failed to find log head");
751 4 : return error;
752 : }
753 :
754 : /*
755 : * Seek backwards in the log for log record headers.
756 : *
757 : * Given a starting log block, walk backwards until we find the provided number
758 : * of records or hit the provided tail block. The return value is the number of
759 : * records encountered or a negative error code. The log block and buffer
760 : * pointer of the last record seen are returned in rblk and rhead respectively.
761 : */
762 : STATIC int
763 32784 : xlog_rseek_logrec_hdr(
764 : struct xlog *log,
765 : xfs_daddr_t head_blk,
766 : xfs_daddr_t tail_blk,
767 : int count,
768 : char *buffer,
769 : xfs_daddr_t *rblk,
770 : struct xlog_rec_header **rhead,
771 : bool *wrapped)
772 : {
773 32784 : int i;
774 32784 : int error;
775 32784 : int found = 0;
776 32784 : char *offset = NULL;
777 32784 : xfs_daddr_t end_blk;
778 :
779 32784 : *wrapped = false;
780 :
781 : /*
782 : * Walk backwards from the head block until we hit the tail or the first
783 : * block in the log.
784 : */
785 32784 : end_blk = head_blk > tail_blk ? tail_blk : 0;
786 4097834 : for (i = (int) head_blk - 1; i >= end_blk; i--) {
787 4095594 : error = xlog_bread(log, i, 1, buffer, &offset);
788 4095594 : if (error)
789 0 : goto out_error;
790 :
791 4095594 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
792 92462 : *rblk = i;
793 92462 : *rhead = (struct xlog_rec_header *) offset;
794 92462 : if (++found == count)
795 : break;
796 : }
797 : }
798 :
799 : /*
800 : * If we haven't hit the tail block or the log record header count,
801 : * start looking again from the end of the physical log. Note that
802 : * callers can pass head == tail if the tail is not yet known.
803 : */
804 32784 : if (tail_blk >= head_blk && found != count) {
805 22848 : for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
806 22848 : error = xlog_bread(log, i, 1, buffer, &offset);
807 22848 : if (error)
808 0 : goto out_error;
809 :
810 22848 : if (*(__be32 *)offset ==
811 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
812 144 : *wrapped = true;
813 144 : *rblk = i;
814 144 : *rhead = (struct xlog_rec_header *) offset;
815 144 : if (++found == count)
816 : break;
817 : }
818 : }
819 : }
820 :
821 : return found;
822 :
823 : out_error:
824 : return error;
825 : }
826 :
827 : /*
828 : * Seek forward in the log for log record headers.
829 : *
830 : * Given head and tail blocks, walk forward from the tail block until we find
831 : * the provided number of records or hit the head block. The return value is the
832 : * number of records encountered or a negative error code. The log block and
833 : * buffer pointer of the last record seen are returned in rblk and rhead
834 : * respectively.
835 : */
836 : STATIC int
837 10296 : xlog_seek_logrec_hdr(
838 : struct xlog *log,
839 : xfs_daddr_t head_blk,
840 : xfs_daddr_t tail_blk,
841 : int count,
842 : char *buffer,
843 : xfs_daddr_t *rblk,
844 : struct xlog_rec_header **rhead,
845 : bool *wrapped)
846 : {
847 10296 : int i;
848 10296 : int error;
849 10296 : int found = 0;
850 10296 : char *offset = NULL;
851 10296 : xfs_daddr_t end_blk;
852 :
853 10296 : *wrapped = false;
854 :
855 : /*
856 : * Walk forward from the tail block until we hit the head or the last
857 : * block in the log.
858 : */
859 10296 : end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
860 10296 : for (i = (int) tail_blk; i <= end_blk; i++) {
861 10296 : error = xlog_bread(log, i, 1, buffer, &offset);
862 10296 : if (error)
863 0 : goto out_error;
864 :
865 10296 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
866 10296 : *rblk = i;
867 10296 : *rhead = (struct xlog_rec_header *) offset;
868 10296 : if (++found == count)
869 : break;
870 : }
871 : }
872 :
873 : /*
874 : * If we haven't hit the head block or the log record header count,
875 : * start looking again from the start of the physical log.
876 : */
877 10296 : if (tail_blk > head_blk && found != count) {
878 0 : for (i = 0; i < (int) head_blk; i++) {
879 0 : error = xlog_bread(log, i, 1, buffer, &offset);
880 0 : if (error)
881 0 : goto out_error;
882 :
883 0 : if (*(__be32 *)offset ==
884 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
885 0 : *wrapped = true;
886 0 : *rblk = i;
887 0 : *rhead = (struct xlog_rec_header *) offset;
888 0 : if (++found == count)
889 : break;
890 : }
891 : }
892 : }
893 :
894 : return found;
895 :
896 : out_error:
897 : return error;
898 : }
899 :
900 : /*
901 : * Calculate distance from head to tail (i.e., unused space in the log).
902 : */
903 : static inline int
904 : xlog_tail_distance(
905 : struct xlog *log,
906 : xfs_daddr_t head_blk,
907 : xfs_daddr_t tail_blk)
908 : {
909 0 : if (head_blk < tail_blk)
910 0 : return tail_blk - head_blk;
911 :
912 0 : return tail_blk + (log->l_logBBsize - head_blk);
913 : }
914 :
915 : /*
916 : * Verify the log tail. This is particularly important when torn or incomplete
917 : * writes have been detected near the front of the log and the head has been
918 : * walked back accordingly.
919 : *
920 : * We also have to handle the case where the tail was pinned and the head
921 : * blocked behind the tail right before a crash. If the tail had been pushed
922 : * immediately prior to the crash and the subsequent checkpoint was only
923 : * partially written, it's possible it overwrote the last referenced tail in the
924 : * log with garbage. This is not a coherency problem because the tail must have
925 : * been pushed before it can be overwritten, but appears as log corruption to
926 : * recovery because we have no way to know the tail was updated if the
927 : * subsequent checkpoint didn't write successfully.
928 : *
929 : * Therefore, CRC check the log from tail to head. If a failure occurs and the
930 : * offending record is within max iclog bufs from the head, walk the tail
931 : * forward and retry until a valid tail is found or corruption is detected out
932 : * of the range of a possible overwrite.
933 : */
934 : STATIC int
935 10296 : xlog_verify_tail(
936 : struct xlog *log,
937 : xfs_daddr_t head_blk,
938 : xfs_daddr_t *tail_blk,
939 : int hsize)
940 : {
941 10296 : struct xlog_rec_header *thead;
942 10296 : char *buffer;
943 10296 : xfs_daddr_t first_bad;
944 10296 : int error = 0;
945 10296 : bool wrapped;
946 10296 : xfs_daddr_t tmp_tail;
947 10296 : xfs_daddr_t orig_tail = *tail_blk;
948 :
949 10296 : buffer = xlog_alloc_buffer(log, 1);
950 10296 : if (!buffer)
951 : return -ENOMEM;
952 :
953 : /*
954 : * Make sure the tail points to a record (returns positive count on
955 : * success).
956 : */
957 10296 : error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
958 : &tmp_tail, &thead, &wrapped);
959 10296 : if (error < 0)
960 0 : goto out;
961 10296 : if (*tail_blk != tmp_tail)
962 0 : *tail_blk = tmp_tail;
963 :
964 : /*
965 : * Run a CRC check from the tail to the head. We can't just check
966 : * MAX_ICLOGS records past the tail because the tail may point to stale
967 : * blocks cleared during the search for the head/tail. These blocks are
968 : * overwritten with zero-length records and thus record count is not a
969 : * reliable indicator of the iclog state before a crash.
970 : */
971 10296 : first_bad = 0;
972 10296 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
973 : XLOG_RECOVER_CRCPASS, &first_bad);
974 10296 : while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
975 0 : int tail_distance;
976 :
977 : /*
978 : * Is corruption within range of the head? If so, retry from
979 : * the next record. Otherwise return an error.
980 : */
981 0 : tail_distance = xlog_tail_distance(log, head_blk, first_bad);
982 0 : if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
983 : break;
984 :
985 : /* skip to the next record; returns positive count on success */
986 0 : error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
987 : buffer, &tmp_tail, &thead, &wrapped);
988 0 : if (error < 0)
989 0 : goto out;
990 :
991 0 : *tail_blk = tmp_tail;
992 0 : first_bad = 0;
993 0 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
994 : XLOG_RECOVER_CRCPASS, &first_bad);
995 : }
996 :
997 10296 : if (!error && *tail_blk != orig_tail)
998 0 : xfs_warn(log->l_mp,
999 : "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1000 : orig_tail, *tail_blk);
1001 10296 : out:
1002 10296 : kmem_free(buffer);
1003 10296 : return error;
1004 : }
1005 :
1006 : /*
1007 : * Detect and trim torn writes from the head of the log.
1008 : *
1009 : * Storage without sector atomicity guarantees can result in torn writes in the
1010 : * log in the event of a crash. Our only means to detect this scenario is via
1011 : * CRC verification. While we can't always be certain that CRC verification
1012 : * failure is due to a torn write vs. an unrelated corruption, we do know that
1013 : * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1014 : * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1015 : * the log and treat failures in this range as torn writes as a matter of
1016 : * policy. In the event of CRC failure, the head is walked back to the last good
1017 : * record in the log and the tail is updated from that record and verified.
1018 : */
1019 : STATIC int
1020 10296 : xlog_verify_head(
1021 : struct xlog *log,
1022 : xfs_daddr_t *head_blk, /* in/out: unverified head */
1023 : xfs_daddr_t *tail_blk, /* out: tail block */
1024 : char *buffer,
1025 : xfs_daddr_t *rhead_blk, /* start blk of last record */
1026 : struct xlog_rec_header **rhead, /* ptr to last record */
1027 : bool *wrapped) /* last rec. wraps phys. log */
1028 : {
1029 10296 : struct xlog_rec_header *tmp_rhead;
1030 10296 : char *tmp_buffer;
1031 10296 : xfs_daddr_t first_bad;
1032 10296 : xfs_daddr_t tmp_rhead_blk;
1033 10296 : int found;
1034 10296 : int error;
1035 10296 : bool tmp_wrapped;
1036 :
1037 : /*
1038 : * Check the head of the log for torn writes. Search backwards from the
1039 : * head until we hit the tail or the maximum number of log record I/Os
1040 : * that could have been in flight at one time. Use a temporary buffer so
1041 : * we don't trash the rhead/buffer pointers from the caller.
1042 : */
1043 10296 : tmp_buffer = xlog_alloc_buffer(log, 1);
1044 10296 : if (!tmp_buffer)
1045 : return -ENOMEM;
1046 10296 : error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1047 : XLOG_MAX_ICLOGS, tmp_buffer,
1048 : &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1049 10296 : kmem_free(tmp_buffer);
1050 10296 : if (error < 0)
1051 : return error;
1052 :
1053 : /*
1054 : * Now run a CRC verification pass over the records starting at the
1055 : * block found above to the current head. If a CRC failure occurs, the
1056 : * log block of the first bad record is saved in first_bad.
1057 : */
1058 10296 : error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1059 : XLOG_RECOVER_CRCPASS, &first_bad);
1060 10296 : if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1061 : /*
1062 : * We've hit a potential torn write. Reset the error and warn
1063 : * about it.
1064 : */
1065 11 : error = 0;
1066 11 : xfs_warn(log->l_mp,
1067 : "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1068 : first_bad, *head_blk);
1069 :
1070 : /*
1071 : * Get the header block and buffer pointer for the last good
1072 : * record before the bad record.
1073 : *
1074 : * Note that xlog_find_tail() clears the blocks at the new head
1075 : * (i.e., the records with invalid CRC) if the cycle number
1076 : * matches the current cycle.
1077 : */
1078 11 : found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1079 : buffer, rhead_blk, rhead, wrapped);
1080 11 : if (found < 0)
1081 : return found;
1082 11 : if (found == 0) /* XXX: right thing to do here? */
1083 : return -EIO;
1084 :
1085 : /*
1086 : * Reset the head block to the starting block of the first bad
1087 : * log record and set the tail block based on the last good
1088 : * record.
1089 : *
1090 : * Bail out if the updated head/tail match as this indicates
1091 : * possible corruption outside of the acceptable
1092 : * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1093 : */
1094 11 : *head_blk = first_bad;
1095 11 : *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1096 11 : if (*head_blk == *tail_blk) {
1097 0 : ASSERT(0);
1098 0 : return 0;
1099 : }
1100 : }
1101 10285 : if (error)
1102 : return error;
1103 :
1104 10296 : return xlog_verify_tail(log, *head_blk, tail_blk,
1105 10296 : be32_to_cpu((*rhead)->h_size));
1106 : }
1107 :
1108 : /*
1109 : * We need to make sure we handle log wrapping properly, so we can't use the
1110 : * calculated logbno directly. Make sure it wraps to the correct bno inside the
1111 : * log.
1112 : *
1113 : * The log is limited to 32 bit sizes, so we use the appropriate modulus
1114 : * operation here and cast it back to a 64 bit daddr on return.
1115 : */
1116 : static inline xfs_daddr_t
1117 : xlog_wrap_logbno(
1118 : struct xlog *log,
1119 : xfs_daddr_t bno)
1120 : {
1121 203072 : int mod;
1122 :
1123 203072 : div_s64_rem(bno, log->l_logBBsize, &mod);
1124 203072 : return mod;
1125 : }
1126 :
1127 : /*
1128 : * Check whether the head of the log points to an unmount record. In other
1129 : * words, determine whether the log is clean. If so, update the in-core state
1130 : * appropriately.
1131 : */
1132 : static int
1133 22488 : xlog_check_unmount_rec(
1134 : struct xlog *log,
1135 : xfs_daddr_t *head_blk,
1136 : xfs_daddr_t *tail_blk,
1137 : struct xlog_rec_header *rhead,
1138 : xfs_daddr_t rhead_blk,
1139 : char *buffer,
1140 : bool *clean)
1141 : {
1142 22488 : struct xlog_op_header *op_head;
1143 22488 : xfs_daddr_t umount_data_blk;
1144 22488 : xfs_daddr_t after_umount_blk;
1145 22488 : int hblks;
1146 22488 : int error;
1147 22488 : char *offset;
1148 :
1149 22488 : *clean = false;
1150 :
1151 : /*
1152 : * Look for unmount record. If we find it, then we know there was a
1153 : * clean unmount. Since 'i' could be the last block in the physical
1154 : * log, we convert to a log block before comparing to the head_blk.
1155 : *
1156 : * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1157 : * below. We won't want to clear the unmount record if there is one, so
1158 : * we pass the lsn of the unmount record rather than the block after it.
1159 : */
1160 22488 : hblks = xlog_logrec_hblks(log, rhead);
1161 44976 : after_umount_blk = xlog_wrap_logbno(log,
1162 22488 : rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1163 :
1164 44976 : if (*head_blk == after_umount_blk &&
1165 22488 : be32_to_cpu(rhead->h_num_logops) == 1) {
1166 12185 : umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1167 12185 : error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1168 12185 : if (error)
1169 : return error;
1170 :
1171 12185 : op_head = (struct xlog_op_header *)offset;
1172 12185 : if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1173 : /*
1174 : * Set tail and last sync so that newly written log
1175 : * records will point recovery to after the current
1176 : * unmount record.
1177 : */
1178 12181 : xlog_assign_atomic_lsn(&log->l_tail_lsn,
1179 12181 : log->l_curr_cycle, after_umount_blk);
1180 12181 : xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1181 : log->l_curr_cycle, after_umount_blk);
1182 12181 : *tail_blk = after_umount_blk;
1183 :
1184 12181 : *clean = true;
1185 : }
1186 : }
1187 :
1188 : return 0;
1189 : }
1190 :
/*
 * Initialise the in-core log head and tail state from the record header @rhead
 * found at @rhead_blk, with the head of the log at @head_blk. If @bump_cycle
 * is true the current cycle is set to one more than the cycle recorded in
 * @rhead.
 */
static void
xlog_set_state(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	struct xlog_rec_header	*rhead,
	xfs_daddr_t		rhead_blk,
	bool			bump_cycle)
{
	/*
	 * Reset log values according to the state of the log when we
	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
	 * one because the next write starts a new cycle rather than
	 * continuing the cycle of the last good log record.  At this
	 * point we have guaranteed that all partial log records have been
	 * accounted for.  Therefore, we know that the last good log record
	 * written was complete and ended exactly on the end boundary
	 * of the physical log.
	 */
	log->l_prev_block = rhead_blk;
	/* the head block is stored as an int in the in-core log */
	log->l_curr_block = (int)head_blk;
	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
	if (bump_cycle)
		log->l_curr_cycle++;
	/* tail and last-sync LSNs are taken verbatim from the record header */
	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
	/*
	 * Both grant heads start at the head of the log: the current cycle
	 * and the byte offset of the current block.
	 */
	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
					BBTOB(log->l_curr_block));
	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
					BBTOB(log->l_curr_block));
}
1221 :
1222 : /*
1223 : * Find the sync block number or the tail of the log.
1224 : *
1225 : * This will be the block number of the last record to have its
1226 : * associated buffers synced to disk. Every log record header has
1227 : * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1228 : * to get a sync block number. The only concern is to figure out which
1229 : * log record header to believe.
1230 : *
1231 : * The following algorithm uses the log record header with the largest
1232 : * lsn. The entire log record does not need to be valid. We only care
1233 : * that the header is valid.
1234 : *
1235 : * We could speed up search by using current head_blk buffer, but it is not
1236 : * available.
1237 : */
STATIC int
xlog_find_tail(
	struct xlog		*log,
	xfs_daddr_t		*head_blk,
	xfs_daddr_t		*tail_blk)
{
	xlog_rec_header_t	*rhead;
	char			*offset = NULL;
	char			*buffer;
	int			error;
	xfs_daddr_t		rhead_blk;
	xfs_lsn_t		tail_lsn;
	bool			wrapped = false;
	bool			clean = false;

	/*
	 * Find previous log record
	 */
	if ((error = xlog_find_head(log, head_blk)))
		return error;
	ASSERT(*head_blk < INT_MAX);

	buffer = xlog_alloc_buffer(log, 1);
	if (!buffer)
		return -ENOMEM;
	/*
	 * A head at block 0 whose first block still carries cycle 0 is
	 * reported with a tail of 0, and no other log state is touched.
	 */
	if (*head_blk == 0) {				/* special case */
		error = xlog_bread(log, 0, 1, buffer, &offset);
		if (error)
			goto done;

		if (xlog_get_cycle(offset) == 0) {
			*tail_blk = 0;
			/* leave all other log inited values alone */
			goto done;
		}
	}

	/*
	 * Search backwards through the log looking for the log record header
	 * block. This wraps all the way back around to the head so something is
	 * seriously wrong if we can't find it.
	 *
	 * A negative return is an error; zero means no header was found, which
	 * is reported as corruption.
	 */
	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
				      &rhead_blk, &rhead, &wrapped);
	if (error < 0)
		goto done;
	if (!error) {
		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
		error = -EFSCORRUPTED;
		goto done;
	}
	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));

	/*
	 * Set the log state based on the current head record.
	 *
	 * The tail LSN is sampled right away: xlog_check_unmount_rec() below
	 * may move l_tail_lsn past an unmount record, and the stale block
	 * clearing at the end of this function needs the value from before
	 * that happens.
	 */
	xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
	tail_lsn = atomic64_read(&log->l_tail_lsn);

	/*
	 * Look for an unmount record at the head of the log. This sets the log
	 * state to determine whether recovery is necessary.
	 */
	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
				       rhead_blk, buffer, &clean);
	if (error)
		goto done;

	/*
	 * Verify the log head if the log is not clean (e.g., we have anything
	 * but an unmount record at the head). This uses CRC verification to
	 * detect and trim torn writes. If discovered, CRC failures are
	 * considered torn writes and the log head is trimmed accordingly.
	 *
	 * Note that we can only run CRC verification when the log is dirty
	 * because there's no guarantee that the log data behind an unmount
	 * record is compatible with the current architecture.
	 */
	if (!clean) {
		xfs_daddr_t	orig_head = *head_blk;

		error = xlog_verify_head(log, head_blk, tail_blk, buffer,
					 &rhead_blk, &rhead, &wrapped);
		if (error)
			goto done;

		/* update in-core state again if the head changed */
		if (*head_blk != orig_head) {
			xlog_set_state(log, *head_blk, rhead, rhead_blk,
				       wrapped);
			tail_lsn = atomic64_read(&log->l_tail_lsn);
			/* the trimmed head may now sit behind an unmount record */
			error = xlog_check_unmount_rec(log, head_blk, tail_blk,
						       rhead, rhead_blk, buffer,
						       &clean);
			if (error)
				goto done;
		}
	}

	/*
	 * Note that the unmount was clean. If the unmount was not clean, we
	 * need to know this to rebuild the superblock counters from the perag
	 * headers if we have a filesystem using non-persistent counters.
	 */
	if (clean)
		set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);

	/*
	 * Make sure that there are no blocks in front of the head
	 * with the same cycle number as the head.  This can happen
	 * because we allow multiple outstanding log writes concurrently,
	 * and the later writes might make it out before earlier ones.
	 *
	 * We use the lsn from before modifying it so that we'll never
	 * overwrite the unmount record after a clean unmount.
	 *
	 * Do this only if we are going to recover the filesystem
	 *
	 * NOTE: This used to say "if (!readonly)"
	 * However on Linux, we can & do recover a read-only filesystem.
	 * We only skip recovery if NORECOVERY is specified on mount,
	 * in which case we would not be here.
	 *
	 * But... if the -device- itself is readonly, just skip this.
	 * We can't recover this device anyway, so it won't matter.
	 */
	if (!xfs_readonly_buftarg(log->l_targ))
		error = xlog_clear_stale_blocks(log, tail_lsn);

done:
	/* common exit: the scratch buffer is released on every path */
	kmem_free(buffer);

	if (error)
		xfs_warn(log->l_mp, "failed to locate log tail");
	return error;
}
1374 :
1375 : /*
1376 : * Is the log zeroed at all?
1377 : *
1378 : * The last binary search should be changed to perform an X block read
1379 : * once X becomes small enough. You can then search linearly through
1380 : * the X blocks. This will cut down on the number of reads we need to do.
1381 : *
1382 : * If the log is partially zeroed, this routine will pass back the blkno
1383 : * of the first block with cycle number 0. It won't have a complete LR
1384 : * preceding it.
1385 : *
1386 : * Return:
1387 : * 0 => the log is completely written to
1388 : * 1 => use *blk_no as the first block of the log
1389 : * <0 => error has occurred
1390 : */
1391 : STATIC int
1392 22481 : xlog_find_zeroed(
1393 : struct xlog *log,
1394 : xfs_daddr_t *blk_no)
1395 : {
1396 22481 : char *buffer;
1397 22481 : char *offset;
1398 22481 : uint first_cycle, last_cycle;
1399 22481 : xfs_daddr_t new_blk, last_blk, start_blk;
1400 22481 : xfs_daddr_t num_scan_bblks;
1401 22481 : int error, log_bbnum = log->l_logBBsize;
1402 :
1403 22481 : *blk_no = 0;
1404 :
1405 : /* check totally zeroed log */
1406 22481 : buffer = xlog_alloc_buffer(log, 1);
1407 22481 : if (!buffer)
1408 : return -ENOMEM;
1409 22481 : error = xlog_bread(log, 0, 1, buffer, &offset);
1410 22481 : if (error)
1411 0 : goto out_free_buffer;
1412 :
1413 22481 : first_cycle = xlog_get_cycle(offset);
1414 22481 : if (first_cycle == 0) { /* completely zeroed log */
1415 0 : *blk_no = 0;
1416 0 : kmem_free(buffer);
1417 0 : return 1;
1418 : }
1419 :
1420 : /* check partially zeroed log */
1421 22481 : error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1422 22481 : if (error)
1423 0 : goto out_free_buffer;
1424 :
1425 22481 : last_cycle = xlog_get_cycle(offset);
1426 22481 : if (last_cycle != 0) { /* log completely written to */
1427 14182 : kmem_free(buffer);
1428 14182 : return 0;
1429 : }
1430 :
1431 : /* we have a partially zeroed log */
1432 8299 : last_blk = log_bbnum-1;
1433 8299 : error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1434 8299 : if (error)
1435 0 : goto out_free_buffer;
1436 :
1437 : /*
1438 : * Validate the answer. Because there is no way to guarantee that
1439 : * the entire log is made up of log records which are the same size,
1440 : * we scan over the defined maximum blocks. At this point, the maximum
1441 : * is not chosen to mean anything special. XXXmiken
1442 : */
1443 8299 : num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1444 8299 : ASSERT(num_scan_bblks <= INT_MAX);
1445 :
1446 8299 : if (last_blk < num_scan_bblks)
1447 : num_scan_bblks = last_blk;
1448 8299 : start_blk = last_blk - num_scan_bblks;
1449 :
1450 : /*
1451 : * We search for any instances of cycle number 0 that occur before
1452 : * our current estimate of the head. What we're trying to detect is
1453 : * 1 ... | 0 | 1 | 0...
1454 : * ^ binary search ends here
1455 : */
1456 8299 : if ((error = xlog_find_verify_cycle(log, start_blk,
1457 : (int)num_scan_bblks, 0, &new_blk)))
1458 0 : goto out_free_buffer;
1459 8299 : if (new_blk != -1)
1460 5 : last_blk = new_blk;
1461 :
1462 : /*
1463 : * Potentially backup over partial log record write. We don't need
1464 : * to search the end of the log because we know it is zero.
1465 : */
1466 8299 : error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1467 8299 : if (error == 1)
1468 : error = -EIO;
1469 8299 : if (error)
1470 0 : goto out_free_buffer;
1471 :
1472 8299 : *blk_no = last_blk;
1473 8299 : out_free_buffer:
1474 8299 : kmem_free(buffer);
1475 8299 : if (error)
1476 0 : return error;
1477 : return 1;
1478 : }
1479 :
1480 : /*
1481 : * These are simple subroutines used by xlog_clear_stale_blocks() below
1482 : * to initialize a buffer full of empty log record headers and write
1483 : * them into the log.
1484 : */
1485 : STATIC void
1486 92024760 : xlog_add_record(
1487 : struct xlog *log,
1488 : char *buf,
1489 : int cycle,
1490 : int block,
1491 : int tail_cycle,
1492 : int tail_block)
1493 : {
1494 92024760 : xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1495 :
1496 92024760 : memset(buf, 0, BBSIZE);
1497 92024760 : recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1498 92024760 : recp->h_cycle = cpu_to_be32(cycle);
1499 92024760 : recp->h_version = cpu_to_be32(
1500 : xfs_has_logv2(log->l_mp) ? 2 : 1);
1501 92024760 : recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1502 92024760 : recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1503 92024760 : recp->h_fmt = cpu_to_be32(XLOG_FMT);
1504 184049520 : memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1505 92024760 : }
1506 :
1507 : STATIC int
1508 22889 : xlog_write_log_records(
1509 : struct xlog *log,
1510 : int cycle,
1511 : int start_block,
1512 : int blocks,
1513 : int tail_cycle,
1514 : int tail_block)
1515 : {
1516 22889 : char *offset;
1517 22889 : char *buffer;
1518 22889 : int balign, ealign;
1519 22889 : int sectbb = log->l_sectBBsize;
1520 22889 : int end_block = start_block + blocks;
1521 22889 : int bufblks;
1522 22889 : int error = 0;
1523 22889 : int i, j = 0;
1524 :
1525 : /*
1526 : * Greedily allocate a buffer big enough to handle the full
1527 : * range of basic blocks to be written. If that fails, try
1528 : * a smaller size. We need to be able to write at least a
1529 : * log sector, or we're out of luck.
1530 : */
1531 22889 : bufblks = 1 << ffs(blocks);
1532 22895 : while (bufblks > log->l_logBBsize)
1533 6 : bufblks >>= 1;
1534 22889 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1535 0 : bufblks >>= 1;
1536 0 : if (bufblks < sectbb)
1537 : return -ENOMEM;
1538 : }
1539 :
1540 : /* We may need to do a read at the start to fill in part of
1541 : * the buffer in the starting sector not covered by the first
1542 : * write below.
1543 : */
1544 22889 : balign = round_down(start_block, sectbb);
1545 22889 : if (balign != start_block) {
1546 0 : error = xlog_bread_noalign(log, start_block, 1, buffer);
1547 0 : if (error)
1548 0 : goto out_free_buffer;
1549 :
1550 0 : j = start_block - balign;
1551 : }
1552 :
1553 114551 : for (i = start_block; i < end_block; i += bufblks) {
1554 91662 : int bcount, endcount;
1555 :
1556 91662 : bcount = min(bufblks, end_block - start_block);
1557 91662 : endcount = bcount - j;
1558 :
1559 : /* We may need to do a read at the end to fill in part of
1560 : * the buffer in the final sector not covered by the write.
1561 : * If this is the same sector as the above read, skip it.
1562 : */
1563 91662 : ealign = round_down(end_block, sectbb);
1564 91662 : if (j == 0 && (start_block + endcount > ealign)) {
1565 0 : error = xlog_bread_noalign(log, ealign, sectbb,
1566 0 : buffer + BBTOB(ealign - start_block));
1567 0 : if (error)
1568 : break;
1569 :
1570 : }
1571 :
1572 91662 : offset = buffer + xlog_align(log, start_block);
1573 92116422 : for (; j < endcount; j++) {
1574 92024760 : xlog_add_record(log, offset, cycle, i+j,
1575 : tail_cycle, tail_block);
1576 92024760 : offset += BBSIZE;
1577 : }
1578 91662 : error = xlog_bwrite(log, start_block, endcount, buffer);
1579 91662 : if (error)
1580 : break;
1581 91662 : start_block += endcount;
1582 91662 : j = 0;
1583 : }
1584 :
1585 22889 : out_free_buffer:
1586 22889 : kmem_free(buffer);
1587 22889 : return error;
1588 : }
1589 :
1590 : /*
1591 : * This routine is called to blow away any incomplete log writes out
1592 : * in front of the log head. We do this so that we won't become confused
1593 : * if we come up, write only a little bit more, and then crash again.
1594 : * If we leave the partial log records out there, this situation could
1595 : * cause us to think those partial writes are valid blocks since they
1596 : * have the current cycle number. We get rid of them by overwriting them
1597 : * with empty log records with the old cycle number rather than the
1598 : * current one.
1599 : *
1600 : * The tail lsn is passed in rather than taken from
1601 : * the log so that we will not write over the unmount record after a
1602 : * clean unmount in a 512 block log. Doing so would leave the log without
1603 : * any valid log records in it until a new one was written. If we crashed
1604 : * during that time we would not be able to recover.
1605 : */
1606 : STATIC int
1607 22473 : xlog_clear_stale_blocks(
1608 : struct xlog *log,
1609 : xfs_lsn_t tail_lsn)
1610 : {
1611 22473 : int tail_cycle, head_cycle;
1612 22473 : int tail_block, head_block;
1613 22473 : int tail_distance, max_distance;
1614 22473 : int distance;
1615 22473 : int error;
1616 :
1617 22473 : tail_cycle = CYCLE_LSN(tail_lsn);
1618 22473 : tail_block = BLOCK_LSN(tail_lsn);
1619 22473 : head_cycle = log->l_curr_cycle;
1620 22473 : head_block = log->l_curr_block;
1621 :
1622 : /*
1623 : * Figure out the distance between the new head of the log
1624 : * and the tail. We want to write over any blocks beyond the
1625 : * head that we may have written just before the crash, but
1626 : * we don't want to overwrite the tail of the log.
1627 : */
1628 22473 : if (head_cycle == tail_cycle) {
1629 : /*
1630 : * The tail is behind the head in the physical log,
1631 : * so the distance from the head to the tail is the
1632 : * distance from the head to the end of the log plus
1633 : * the distance from the beginning of the log to the
1634 : * tail.
1635 : */
1636 21971 : if (XFS_IS_CORRUPT(log->l_mp,
1637 : head_block < tail_block ||
1638 : head_block >= log->l_logBBsize))
1639 0 : return -EFSCORRUPTED;
1640 21971 : tail_distance = tail_block + (log->l_logBBsize - head_block);
1641 : } else {
1642 : /*
1643 : * The head is behind the tail in the physical log,
1644 : * so the distance from the head to the tail is just
1645 : * the tail block minus the head block.
1646 : */
1647 502 : if (XFS_IS_CORRUPT(log->l_mp,
1648 : head_block >= tail_block ||
1649 : head_cycle != tail_cycle + 1))
1650 0 : return -EFSCORRUPTED;
1651 502 : tail_distance = tail_block - head_block;
1652 : }
1653 :
1654 : /*
1655 : * If the head is right up against the tail, we can't clear
1656 : * anything.
1657 : */
1658 22473 : if (tail_distance <= 0) {
1659 0 : ASSERT(tail_distance == 0);
1660 0 : return 0;
1661 : }
1662 :
1663 22473 : max_distance = XLOG_TOTAL_REC_SHIFT(log);
1664 : /*
1665 : * Take the smaller of the maximum amount of outstanding I/O
1666 : * we could have and the distance to the tail to clear out.
1667 : * We take the smaller so that we don't overwrite the tail and
1668 : * we don't waste all day writing from the head to the tail
1669 : * for no reason.
1670 : */
1671 22473 : max_distance = min(max_distance, tail_distance);
1672 :
1673 22473 : if ((head_block + max_distance) <= log->l_logBBsize) {
1674 : /*
1675 : * We can stomp all the blocks we need to without
1676 : * wrapping around the end of the log. Just do it
1677 : * in a single write. Use the cycle number of the
1678 : * current cycle minus one so that the log will look like:
1679 : * n ... | n - 1 ...
1680 : */
1681 22057 : error = xlog_write_log_records(log, (head_cycle - 1),
1682 : head_block, max_distance, tail_cycle,
1683 : tail_block);
1684 22057 : if (error)
1685 0 : return error;
1686 : } else {
1687 : /*
1688 : * We need to wrap around the end of the physical log in
1689 : * order to clear all the blocks. Do it in two separate
1690 : * I/Os. The first write should be from the head to the
1691 : * end of the physical log, and it should use the current
1692 : * cycle number minus one just like above.
1693 : */
1694 416 : distance = log->l_logBBsize - head_block;
1695 416 : error = xlog_write_log_records(log, (head_cycle - 1),
1696 : head_block, distance, tail_cycle,
1697 : tail_block);
1698 :
1699 416 : if (error)
1700 : return error;
1701 :
1702 : /*
1703 : * Now write the blocks at the start of the physical log.
1704 : * This writes the remainder of the blocks we want to clear.
1705 : * It uses the current cycle number since we're now on the
1706 : * same cycle as the head so that we get:
1707 : * n ... n ... | n - 1 ...
1708 : * ^^^^^ blocks we're writing
1709 : */
1710 416 : distance = max_distance - (log->l_logBBsize - head_block);
1711 416 : error = xlog_write_log_records(log, head_cycle, 0, distance,
1712 : tail_cycle, tail_block);
1713 416 : if (error)
1714 0 : return error;
1715 : }
1716 :
1717 : return 0;
1718 : }
1719 :
1720 : /*
1721 : * Release the recovered intent item in the AIL that matches the given intent
1722 : * type and intent id.
1723 : */
1724 : void
1725 118304 : xlog_recover_release_intent(
1726 : struct xlog *log,
1727 : unsigned short intent_type,
1728 : uint64_t intent_id)
1729 : {
1730 118304 : struct xfs_ail_cursor cur;
1731 118304 : struct xfs_log_item *lip;
1732 118304 : struct xfs_ail *ailp = log->l_ailp;
1733 :
1734 118304 : spin_lock(&ailp->ail_lock);
1735 137865 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1736 19561 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1737 137230 : if (lip->li_type != intent_type)
1738 14785 : continue;
1739 122445 : if (!lip->li_ops->iop_match(lip, intent_id))
1740 4776 : continue;
1741 :
1742 117669 : spin_unlock(&ailp->ail_lock);
1743 117669 : lip->li_ops->iop_release(lip);
1744 117669 : spin_lock(&ailp->ail_lock);
1745 : break;
1746 : }
1747 :
1748 118304 : xfs_trans_ail_cursor_done(&cur);
1749 118304 : spin_unlock(&ailp->ail_lock);
1750 118304 : }
1751 :
1752 : int
1753 410 : xlog_recover_iget(
1754 : struct xfs_mount *mp,
1755 : xfs_ino_t ino,
1756 : struct xfs_inode **ipp)
1757 : {
1758 410 : int error;
1759 :
1760 410 : error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
1761 410 : if (error)
1762 : return error;
1763 :
1764 410 : error = xfs_qm_dqattach(*ipp);
1765 410 : if (error) {
1766 0 : xfs_irele(*ipp);
1767 0 : return error;
1768 : }
1769 :
1770 410 : if (VFS_I(*ipp)->i_nlink == 0)
1771 12 : xfs_iflags_set(*ipp, XFS_IRECOVERY);
1772 :
1773 : return 0;
1774 : }
1775 :
1776 : /******************************************************************************
1777 : *
1778 : * Log recover routines
1779 : *
1780 : ******************************************************************************
1781 : */
/*
 * Table of the recovery operation vectors, one per recoverable log item type.
 * xlog_find_item_ops() walks it linearly and matches on ->item_type.
 */
static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
	&xlog_buf_item_ops,
	&xlog_inode_item_ops,
	&xlog_dquot_item_ops,
	&xlog_quotaoff_item_ops,
	&xlog_icreate_item_ops,
	&xlog_efi_item_ops,
	&xlog_efd_item_ops,
	&xlog_rui_item_ops,
	&xlog_rud_item_ops,
	&xlog_cui_item_ops,
	&xlog_cud_item_ops,
	&xlog_bui_item_ops,
	&xlog_bud_item_ops,
	&xlog_attri_item_ops,
	&xlog_attrd_item_ops,
};
1799 :
1800 : static const struct xlog_recover_item_ops *
1801 47225500 : xlog_find_item_ops(
1802 : struct xlog_recover_item *item)
1803 : {
1804 47225500 : unsigned int i;
1805 :
1806 85029212 : for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1807 85029212 : if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1808 47225500 : return xlog_recover_item_ops[i];
1809 :
1810 : return NULL;
1811 : }
1812 :
1813 : /*
1814 : * Sort the log items in the transaction.
1815 : *
1816 : * The ordering constraints are defined by the inode allocation and unlink
1817 : * behaviour. The rules are:
1818 : *
1819 : * 1. Every item is only logged once in a given transaction. Hence it
1820 : * represents the last logged state of the item. Hence ordering is
1821 : * dependent on the order in which operations need to be performed so
1822 : * required initial conditions are always met.
1823 : *
1824 : * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1825 : * there's nothing to replay from them so we can simply cull them
1826 : * from the transaction. However, we can't do that until after we've
1827 : * replayed all the other items because they may be dependent on the
1828 : * cancelled buffer and replaying the cancelled buffer can remove it
1829 : * form the cancelled buffer table. Hence they have tobe done last.
1830 : *
1831 : * 3. Inode allocation buffers must be replayed before inode items that
1832 : * read the buffer and replay changes into it. For filesystems using the
1833 : * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1834 : * treated the same as inode allocation buffers as they create and
1835 : * initialise the buffers directly.
1836 : *
1837 : * 4. Inode unlink buffers must be replayed after inode items are replayed.
1838 : * This ensures that inodes are completely flushed to the inode buffer
1839 : * in a "free" state before we remove the unlinked inode list pointer.
1840 : *
1841 : * Hence the ordering needs to be inode allocation buffers first, inode items
1842 : * second, inode unlink buffers third and cancelled buffers last.
1843 : *
1844 : * But there's a problem with that - we can't tell an inode allocation buffer
1845 : * apart from a regular buffer, so we can't separate them. We can, however,
1846 : * tell an inode unlink buffer from the others, and so we can separate them out
1847 : * from all the other buffers and move them to last.
1848 : *
1849 : * Hence, 4 lists, in order from head to tail:
1850 : * - buffer_list for all buffers except cancelled/inode unlink buffers
1851 : * - item_list for all non-buffer items
1852 : * - inode_buffer_list for inode unlink buffers
1853 : * - cancel_list for the cancelled buffers
1854 : *
1855 : * Note that we add objects to the tail of the lists so that first-to-last
1856 : * ordering is preserved within the lists. Adding objects to the head of the
1857 : * list means when we traverse from the head we walk them in last-to-first
1858 : * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1859 : * but for all other items there may be specific ordering that we need to
1860 : * preserve.
1861 : */
1862 : STATIC int
1863 727392 : xlog_recover_reorder_trans(
1864 : struct xlog *log,
1865 : struct xlog_recover *trans,
1866 : int pass)
1867 : {
1868 727392 : struct xlog_recover_item *item, *n;
1869 727392 : int error = 0;
1870 727392 : LIST_HEAD(sort_list);
1871 727392 : LIST_HEAD(cancel_list);
1872 727392 : LIST_HEAD(buffer_list);
1873 727392 : LIST_HEAD(inode_buffer_list);
1874 727392 : LIST_HEAD(item_list);
1875 :
1876 727392 : list_splice_init(&trans->r_itemq, &sort_list);
1877 47952892 : list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1878 47225500 : enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
1879 :
1880 47225500 : item->ri_ops = xlog_find_item_ops(item);
1881 47225500 : if (!item->ri_ops) {
1882 0 : xfs_warn(log->l_mp,
1883 : "%s: unrecognized type of log operation (%d)",
1884 : __func__, ITEM_TYPE(item));
1885 0 : ASSERT(0);
1886 : /*
1887 : * return the remaining items back to the transaction
1888 : * item list so they can be freed in caller.
1889 : */
1890 0 : if (!list_empty(&sort_list))
1891 0 : list_splice_init(&sort_list, &trans->r_itemq);
1892 : error = -EFSCORRUPTED;
1893 : break;
1894 : }
1895 :
1896 47225500 : if (item->ri_ops->reorder)
1897 23770148 : fate = item->ri_ops->reorder(item);
1898 :
1899 23770148 : switch (fate) {
1900 23194334 : case XLOG_REORDER_BUFFER_LIST:
1901 23194334 : list_move_tail(&item->ri_list, &buffer_list);
1902 23194334 : break;
1903 558444 : case XLOG_REORDER_CANCEL_LIST:
1904 558444 : trace_xfs_log_recover_item_reorder_head(log,
1905 : trans, item, pass);
1906 558444 : list_move(&item->ri_list, &cancel_list);
1907 558444 : break;
1908 17370 : case XLOG_REORDER_INODE_BUFFER_LIST:
1909 17370 : list_move(&item->ri_list, &inode_buffer_list);
1910 17370 : break;
1911 23455352 : case XLOG_REORDER_ITEM_LIST:
1912 23455352 : trace_xfs_log_recover_item_reorder_tail(log,
1913 : trans, item, pass);
1914 23455352 : list_move_tail(&item->ri_list, &item_list);
1915 23455352 : break;
1916 : }
1917 : }
1918 :
1919 727392 : ASSERT(list_empty(&sort_list));
1920 727392 : if (!list_empty(&buffer_list))
1921 696990 : list_splice(&buffer_list, &trans->r_itemq);
1922 727392 : if (!list_empty(&item_list))
1923 726608 : list_splice_tail(&item_list, &trans->r_itemq);
1924 727392 : if (!list_empty(&inode_buffer_list))
1925 5074 : list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1926 727392 : if (!list_empty(&cancel_list))
1927 79996 : list_splice_tail(&cancel_list, &trans->r_itemq);
1928 727392 : return error;
1929 : }
1930 :
1931 : void
1932 23364941 : xlog_buf_readahead(
1933 : struct xlog *log,
1934 : xfs_daddr_t blkno,
1935 : uint len,
1936 : const struct xfs_buf_ops *ops)
1937 : {
1938 23364941 : if (!xlog_is_buffer_cancelled(log, blkno, len))
1939 22528053 : xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
1940 23364941 : }
1941 :
1942 : STATIC int
1943 456489 : xlog_recover_items_pass2(
1944 : struct xlog *log,
1945 : struct xlog_recover *trans,
1946 : struct list_head *buffer_list,
1947 : struct list_head *item_list)
1948 : {
1949 456489 : struct xlog_recover_item *item;
1950 456489 : int error = 0;
1951 :
1952 24069239 : list_for_each_entry(item, item_list, ri_list) {
1953 23612750 : trace_xfs_log_recover_item_recover(log, trans, item,
1954 : XLOG_RECOVER_PASS2);
1955 :
1956 23612750 : if (item->ri_ops->commit_pass2)
1957 23612750 : error = item->ri_ops->commit_pass2(log, buffer_list,
1958 : item, trans->r_lsn);
1959 23612750 : if (error)
1960 0 : return error;
1961 : }
1962 :
1963 : return error;
1964 : }
1965 :
1966 : /*
1967 : * Perform the transaction.
1968 : *
1969 : * If the transaction modifies a buffer or inode, do it now. Otherwise,
1970 : * EFIs and EFDs get queued up by adding entries into the AIL for them.
1971 : */
1972 : STATIC int
1973 727392 : xlog_recover_commit_trans(
1974 : struct xlog *log,
1975 : struct xlog_recover *trans,
1976 : int pass,
1977 : struct list_head *buffer_list)
1978 : {
1979 727392 : int error = 0;
1980 727392 : int items_queued = 0;
1981 727392 : struct xlog_recover_item *item;
1982 727392 : struct xlog_recover_item *next;
1983 727392 : LIST_HEAD (ra_list);
1984 727392 : LIST_HEAD (done_list);
1985 :
1986 : #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
1987 :
1988 727392 : hlist_del_init(&trans->r_list);
1989 :
1990 727392 : error = xlog_recover_reorder_trans(log, trans, pass);
1991 727392 : if (error)
1992 : return error;
1993 :
1994 47952892 : list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1995 47225500 : trace_xfs_log_recover_item_recover(log, trans, item, pass);
1996 :
1997 47225500 : switch (pass) {
1998 23612750 : case XLOG_RECOVER_PASS1:
1999 23612750 : if (item->ri_ops->commit_pass1)
2000 11877272 : error = item->ri_ops->commit_pass1(log, item);
2001 : break;
2002 23612750 : case XLOG_RECOVER_PASS2:
2003 23612750 : if (item->ri_ops->ra_pass2)
2004 23364941 : item->ri_ops->ra_pass2(log, item);
2005 23612750 : list_move_tail(&item->ri_list, &ra_list);
2006 23612750 : items_queued++;
2007 23612750 : if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
2008 94334 : error = xlog_recover_items_pass2(log, trans,
2009 : buffer_list, &ra_list);
2010 94334 : list_splice_tail_init(&ra_list, &done_list);
2011 : items_queued = 0;
2012 : }
2013 :
2014 : break;
2015 0 : default:
2016 0 : ASSERT(0);
2017 : }
2018 :
2019 47225500 : if (error)
2020 0 : goto out;
2021 : }
2022 :
2023 727392 : out:
2024 727392 : if (!list_empty(&ra_list)) {
2025 362155 : if (!error)
2026 362155 : error = xlog_recover_items_pass2(log, trans,
2027 : buffer_list, &ra_list);
2028 362155 : list_splice_tail_init(&ra_list, &done_list);
2029 : }
2030 :
2031 727392 : if (!list_empty(&done_list))
2032 363696 : list_splice_init(&done_list, &trans->r_itemq);
2033 :
2034 : return error;
2035 : }
2036 :
2037 : STATIC void
2038 47401032 : xlog_recover_add_item(
2039 : struct list_head *head)
2040 : {
2041 47401032 : struct xlog_recover_item *item;
2042 :
2043 47401032 : item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
2044 47401032 : INIT_LIST_HEAD(&item->ri_list);
2045 47401032 : list_add_tail(&item->ri_list, head);
2046 47401032 : }
2047 :
2048 : STATIC int
2049 1530126 : xlog_recover_add_to_cont_trans(
2050 : struct xlog *log,
2051 : struct xlog_recover *trans,
2052 : char *dp,
2053 : int len)
2054 : {
2055 1530126 : struct xlog_recover_item *item;
2056 1530126 : char *ptr, *old_ptr;
2057 1530126 : int old_len;
2058 :
2059 : /*
2060 : * If the transaction is empty, the header was split across this and the
2061 : * previous record. Copy the rest of the header.
2062 : */
2063 1530126 : if (list_empty(&trans->r_itemq)) {
2064 0 : ASSERT(len <= sizeof(struct xfs_trans_header));
2065 0 : if (len > sizeof(struct xfs_trans_header)) {
2066 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2067 0 : return -EFSCORRUPTED;
2068 : }
2069 :
2070 0 : xlog_recover_add_item(&trans->r_itemq);
2071 0 : ptr = (char *)&trans->r_theader +
2072 0 : sizeof(struct xfs_trans_header) - len;
2073 0 : memcpy(ptr, dp, len);
2074 0 : return 0;
2075 : }
2076 :
2077 : /* take the tail entry */
2078 1530126 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2079 : ri_list);
2080 :
2081 1530126 : old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
2082 1530126 : old_len = item->ri_buf[item->ri_cnt-1].i_len;
2083 :
2084 1530126 : ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2085 1530126 : if (!ptr)
2086 : return -ENOMEM;
2087 3060252 : memcpy(&ptr[old_len], dp, len);
2088 1530126 : item->ri_buf[item->ri_cnt-1].i_len += len;
2089 1530126 : item->ri_buf[item->ri_cnt-1].i_addr = ptr;
2090 1530126 : trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
2091 1530126 : return 0;
2092 : }
2093 :
2094 : /*
2095 : * The next region to add is the start of a new region. It could be
2096 : * a whole region or it could be the first part of a new region. Because
2097 : * of this, the assumption here is that the type and size fields of all
2098 : * format structures fit into the first 32 bits of the structure.
2099 : *
2100 : * This works because all regions must be 32 bit aligned. Therefore, we
2101 : * either have both fields or we have neither field. In the case we have
2102 : * neither field, the data part of the region is zero length. We only have
2103 : * a log_op_header and can throw away the header since a new one will appear
2104 : * later. If we have at least 4 bytes, then we can determine how many regions
2105 : * will appear in the current log item.
2106 : */
2107 : STATIC int
2108 115203568 : xlog_recover_add_to_trans(
2109 : struct xlog *log,
2110 : struct xlog_recover *trans,
2111 : char *dp,
2112 : int len)
2113 : {
2114 115203568 : struct xfs_inode_log_format *in_f; /* any will do */
2115 115203568 : struct xlog_recover_item *item;
2116 115203568 : char *ptr;
2117 :
2118 115203568 : if (!len)
2119 : return 0;
2120 115203568 : if (list_empty(&trans->r_itemq)) {
2121 : /* we need to catch log corruptions here */
2122 728868 : if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
2123 0 : xfs_warn(log->l_mp, "%s: bad header magic number",
2124 : __func__);
2125 0 : ASSERT(0);
2126 0 : return -EFSCORRUPTED;
2127 : }
2128 :
2129 728868 : if (len > sizeof(struct xfs_trans_header)) {
2130 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2131 0 : ASSERT(0);
2132 0 : return -EFSCORRUPTED;
2133 : }
2134 :
2135 : /*
2136 : * The transaction header can be arbitrarily split across op
2137 : * records. If we don't have the whole thing here, copy what we
2138 : * do have and handle the rest in the next record.
2139 : */
2140 728868 : if (len == sizeof(struct xfs_trans_header))
2141 728868 : xlog_recover_add_item(&trans->r_itemq);
2142 1457736 : memcpy(&trans->r_theader, dp, len);
2143 728868 : return 0;
2144 : }
2145 :
2146 114474700 : ptr = kmem_alloc(len, 0);
2147 228949400 : memcpy(ptr, dp, len);
2148 114474700 : in_f = (struct xfs_inode_log_format *)ptr;
2149 :
2150 : /* take the tail entry */
2151 114474700 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2152 : ri_list);
2153 114474700 : if (item->ri_total != 0 &&
2154 113745832 : item->ri_total == item->ri_cnt) {
2155 : /* tail item is in use, get a new one */
2156 46672164 : xlog_recover_add_item(&trans->r_itemq);
2157 46672164 : item = list_entry(trans->r_itemq.prev,
2158 : struct xlog_recover_item, ri_list);
2159 : }
2160 :
2161 114474700 : if (item->ri_total == 0) { /* first region to be added */
2162 47401032 : if (in_f->ilf_size == 0 ||
2163 : in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2164 0 : xfs_warn(log->l_mp,
2165 : "bad number of regions (%d) in inode log format",
2166 : in_f->ilf_size);
2167 0 : ASSERT(0);
2168 0 : kmem_free(ptr);
2169 0 : return -EFSCORRUPTED;
2170 : }
2171 :
2172 47401032 : item->ri_total = in_f->ilf_size;
2173 47401032 : item->ri_buf =
2174 47401032 : kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2175 : 0);
2176 : }
2177 :
2178 114474700 : if (item->ri_total <= item->ri_cnt) {
2179 0 : xfs_warn(log->l_mp,
2180 : "log item region count (%d) overflowed size (%d)",
2181 : item->ri_cnt, item->ri_total);
2182 0 : ASSERT(0);
2183 0 : kmem_free(ptr);
2184 0 : return -EFSCORRUPTED;
2185 : }
2186 :
2187 : /* Description region is ri_buf[0] */
2188 114474700 : item->ri_buf[item->ri_cnt].i_addr = ptr;
2189 114474700 : item->ri_buf[item->ri_cnt].i_len = len;
2190 114474700 : item->ri_cnt++;
2191 114474700 : trace_xfs_log_recover_item_add(log, trans, item, 0);
2192 114474700 : return 0;
2193 : }
2194 :
2195 : /*
2196 : * Free up any resources allocated by the transaction
2197 : *
2198 : * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2199 : */
2200 : STATIC void
2201 728868 : xlog_recover_free_trans(
2202 : struct xlog_recover *trans)
2203 : {
2204 728868 : struct xlog_recover_item *item, *n;
2205 728868 : int i;
2206 :
2207 728868 : hlist_del_init(&trans->r_list);
2208 :
2209 48129900 : list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2210 : /* Free the regions in the item. */
2211 47401032 : list_del(&item->ri_list);
2212 161875732 : for (i = 0; i < item->ri_cnt; i++)
2213 114474700 : kmem_free(item->ri_buf[i].i_addr);
2214 : /* Free the item itself */
2215 47401032 : kmem_free(item->ri_buf);
2216 47401032 : kmem_free(item);
2217 : }
2218 : /* Free the transaction recover structure */
2219 728868 : kmem_free(trans);
2220 728868 : }
2221 :
2222 : /*
2223 : * On error or completion, trans is freed.
2224 : */
2225 : STATIC int
2226 117461086 : xlog_recovery_process_trans(
2227 : struct xlog *log,
2228 : struct xlog_recover *trans,
2229 : char *dp,
2230 : unsigned int len,
2231 : unsigned int flags,
2232 : int pass,
2233 : struct list_head *buffer_list)
2234 : {
2235 117461086 : int error = 0;
2236 117461086 : bool freeit = false;
2237 :
2238 : /* mask off ophdr transaction container flags */
2239 117461086 : flags &= ~XLOG_END_TRANS;
2240 117461086 : if (flags & XLOG_WAS_CONT_TRANS)
2241 1530126 : flags &= ~XLOG_CONTINUE_TRANS;
2242 :
2243 : /*
2244 : * Callees must not free the trans structure. We'll decide if we need to
2245 : * free it or not based on the operation being done and it's result.
2246 : */
2247 117461086 : switch (flags) {
2248 : /* expected flag values */
2249 115203568 : case 0:
2250 : case XLOG_CONTINUE_TRANS:
2251 115203568 : error = xlog_recover_add_to_trans(log, trans, dp, len);
2252 115203568 : break;
2253 1530126 : case XLOG_WAS_CONT_TRANS:
2254 1530126 : error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2255 1530126 : break;
2256 727392 : case XLOG_COMMIT_TRANS:
2257 727392 : error = xlog_recover_commit_trans(log, trans, pass,
2258 : buffer_list);
2259 : /* success or fail, we are now done with this transaction. */
2260 727392 : freeit = true;
2261 727392 : break;
2262 :
2263 : /* unexpected flag values */
2264 0 : case XLOG_UNMOUNT_TRANS:
2265 : /* just skip trans */
2266 0 : xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2267 0 : freeit = true;
2268 0 : break;
2269 0 : case XLOG_START_TRANS:
2270 : default:
2271 0 : xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2272 0 : ASSERT(0);
2273 0 : error = -EFSCORRUPTED;
2274 0 : break;
2275 : }
2276 117461086 : if (error || freeit)
2277 727392 : xlog_recover_free_trans(trans);
2278 117461086 : return error;
2279 : }
2280 :
2281 : /*
2282 : * Lookup the transaction recovery structure associated with the ID in the
2283 : * current ophdr. If the transaction doesn't exist and the start flag is set in
2284 : * the ophdr, then allocate a new transaction for future ID matches to find.
2285 : * Either way, return what we found during the lookup - an existing transaction
2286 : * or nothing.
2287 : */
2288 : STATIC struct xlog_recover *
2289 118245384 : xlog_recover_ophdr_to_trans(
2290 : struct hlist_head rhash[],
2291 : struct xlog_rec_header *rhead,
2292 : struct xlog_op_header *ohead)
2293 : {
2294 118245384 : struct xlog_recover *trans;
2295 118245384 : xlog_tid_t tid;
2296 118245384 : struct hlist_head *rhp;
2297 :
2298 118245384 : tid = be32_to_cpu(ohead->oh_tid);
2299 118245384 : rhp = &rhash[XLOG_RHASH(tid)];
2300 236782790 : hlist_for_each_entry(trans, rhp, r_list) {
2301 117753108 : if (trans->r_log_tid == tid)
2302 117461086 : return trans;
2303 : }
2304 :
2305 : /*
2306 : * skip over non-start transaction headers - we could be
2307 : * processing slack space before the next transaction starts
2308 : */
2309 784298 : if (!(ohead->oh_flags & XLOG_START_TRANS))
2310 : return NULL;
2311 :
2312 728868 : ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2313 :
2314 : /*
2315 : * This is a new transaction so allocate a new recovery container to
2316 : * hold the recovery ops that will follow.
2317 : */
2318 728868 : trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2319 728868 : trans->r_log_tid = tid;
2320 728868 : trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2321 728868 : INIT_LIST_HEAD(&trans->r_itemq);
2322 728868 : INIT_HLIST_NODE(&trans->r_list);
2323 728868 : hlist_add_head(&trans->r_list, rhp);
2324 :
2325 : /*
2326 : * Nothing more to do for this ophdr. Items to be added to this new
2327 : * transaction will be in subsequent ophdr containers.
2328 : */
2329 728868 : return NULL;
2330 : }
2331 :
2332 : STATIC int
2333 118245384 : xlog_recover_process_ophdr(
2334 : struct xlog *log,
2335 : struct hlist_head rhash[],
2336 : struct xlog_rec_header *rhead,
2337 : struct xlog_op_header *ohead,
2338 : char *dp,
2339 : char *end,
2340 : int pass,
2341 : struct list_head *buffer_list)
2342 : {
2343 118245384 : struct xlog_recover *trans;
2344 118245384 : unsigned int len;
2345 118245384 : int error;
2346 :
2347 : /* Do we understand who wrote this op? */
2348 118245384 : if (ohead->oh_clientid != XFS_TRANSACTION &&
2349 : ohead->oh_clientid != XFS_LOG) {
2350 0 : xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2351 : __func__, ohead->oh_clientid);
2352 0 : ASSERT(0);
2353 0 : return -EFSCORRUPTED;
2354 : }
2355 :
2356 : /*
2357 : * Check the ophdr contains all the data it is supposed to contain.
2358 : */
2359 118245384 : len = be32_to_cpu(ohead->oh_len);
2360 118245384 : if (dp + len > end) {
2361 0 : xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2362 0 : WARN_ON(1);
2363 0 : return -EFSCORRUPTED;
2364 : }
2365 :
2366 118245384 : trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2367 118245384 : if (!trans) {
2368 : /* nothing to do, so skip over this ophdr */
2369 : return 0;
2370 : }
2371 :
2372 : /*
2373 : * The recovered buffer queue is drained only once we know that all
2374 : * recovery items for the current LSN have been processed. This is
2375 : * required because:
2376 : *
2377 : * - Buffer write submission updates the metadata LSN of the buffer.
2378 : * - Log recovery skips items with a metadata LSN >= the current LSN of
2379 : * the recovery item.
2380 : * - Separate recovery items against the same metadata buffer can share
2381 : * a current LSN. I.e., consider that the LSN of a recovery item is
2382 : * defined as the starting LSN of the first record in which its
2383 : * transaction appears, that a record can hold multiple transactions,
2384 : * and/or that a transaction can span multiple records.
2385 : *
2386 : * In other words, we are allowed to submit a buffer from log recovery
2387 : * once per current LSN. Otherwise, we may incorrectly skip recovery
2388 : * items and cause corruption.
2389 : *
2390 : * We don't know up front whether buffers are updated multiple times per
2391 : * LSN. Therefore, track the current LSN of each commit log record as it
2392 : * is processed and drain the queue when it changes. Use commit records
2393 : * because they are ordered correctly by the logging code.
2394 : */
2395 117461086 : if (log->l_recovery_lsn != trans->r_lsn &&
2396 117339635 : ohead->oh_flags & XLOG_COMMIT_TRANS) {
2397 724595 : error = xfs_buf_delwri_submit(buffer_list);
2398 724595 : if (error)
2399 : return error;
2400 724595 : log->l_recovery_lsn = trans->r_lsn;
2401 : }
2402 :
2403 117461086 : return xlog_recovery_process_trans(log, trans, dp, len,
2404 117461086 : ohead->oh_flags, pass, buffer_list);
2405 : }
2406 :
2407 : /*
2408 : * There are two valid states of the r_state field. 0 indicates that the
2409 : * transaction structure is in a normal state. We have either seen the
2410 : * start of the transaction or the last operation we added was not a partial
2411 : * operation. If the last operation we added to the transaction was a
2412 : * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2413 : *
2414 : * NOTE: skip LRs with 0 data length.
2415 : */
2416 : STATIC int
2417 2295766 : xlog_recover_process_data(
2418 : struct xlog *log,
2419 : struct hlist_head rhash[],
2420 : struct xlog_rec_header *rhead,
2421 : char *dp,
2422 : int pass,
2423 : struct list_head *buffer_list)
2424 : {
2425 2295766 : struct xlog_op_header *ohead;
2426 2295766 : char *end;
2427 2295766 : int num_logops;
2428 2295766 : int error;
2429 :
2430 2295766 : end = dp + be32_to_cpu(rhead->h_len);
2431 2295766 : num_logops = be32_to_cpu(rhead->h_num_logops);
2432 :
2433 : /* check the log format matches our own - else we can't recover */
2434 2295766 : if (xlog_header_check_recover(log->l_mp, rhead))
2435 : return -EIO;
2436 :
2437 2295766 : trace_xfs_log_recover_record(log, rhead, pass);
2438 120541150 : while ((dp < end) && num_logops) {
2439 :
2440 118245384 : ohead = (struct xlog_op_header *)dp;
2441 118245384 : dp += sizeof(*ohead);
2442 118245384 : ASSERT(dp <= end);
2443 :
2444 : /* errors will abort recovery */
2445 118245384 : error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
2446 : dp, end, pass, buffer_list);
2447 118245384 : if (error)
2448 0 : return error;
2449 :
2450 118245384 : dp += be32_to_cpu(ohead->oh_len);
2451 118245384 : num_logops--;
2452 : }
2453 : return 0;
2454 : }
2455 :
2456 : /* Take all the collected deferred ops and finish them in order. */
2457 : static int
2458 10292 : xlog_finish_defer_ops(
2459 : struct xfs_mount *mp,
2460 : struct list_head *capture_list)
2461 : {
2462 10292 : struct xfs_defer_capture *dfc, *next;
2463 10292 : struct xfs_trans *tp;
2464 10292 : int error = 0;
2465 :
2466 10981 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2467 689 : struct xfs_trans_res resv;
2468 689 : struct xfs_defer_resources dres;
2469 :
2470 : /*
2471 : * Create a new transaction reservation from the captured
2472 : * information. Set logcount to 1 to force the new transaction
2473 : * to regrant every roll so that we can make forward progress
2474 : * in recovery no matter how full the log might be.
2475 : */
2476 689 : resv.tr_logres = dfc->dfc_logres;
2477 689 : resv.tr_logcount = 1;
2478 689 : resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2479 :
2480 689 : error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2481 : dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2482 689 : if (error) {
2483 0 : xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
2484 0 : return error;
2485 : }
2486 :
2487 : /*
2488 : * Transfer to this new transaction all the dfops we captured
2489 : * from recovering a single intent item.
2490 : */
2491 689 : list_del_init(&dfc->dfc_list);
2492 689 : xfs_defer_ops_continue(dfc, tp, &dres);
2493 689 : error = xfs_trans_commit(tp);
2494 689 : xfs_defer_resources_rele(&dres);
2495 689 : if (error)
2496 0 : return error;
2497 : }
2498 :
2499 10292 : ASSERT(list_empty(capture_list));
2500 : return 0;
2501 : }
2502 :
2503 : /* Release all the captured defer ops and capture structures in this list. */
2504 : static void
2505 2 : xlog_abort_defer_ops(
2506 : struct xfs_mount *mp,
2507 : struct list_head *capture_list)
2508 : {
2509 2 : struct xfs_defer_capture *dfc;
2510 2 : struct xfs_defer_capture *next;
2511 :
2512 2 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2513 0 : list_del_init(&dfc->dfc_list);
2514 0 : xfs_defer_ops_capture_abort(mp, dfc);
2515 : }
2516 2 : }
2517 :
2518 : /*
2519 : * When this is called, all of the log intent items which did not have
2520 : * corresponding log done items should be in the AIL. What we do now is update
2521 : * the data structures associated with each one.
2522 : *
2523 : * Since we process the log intent items in normal transactions, they will be
2524 : * removed at some point after the commit. This prevents us from just walking
2525 : * down the list processing each one. We'll use a flag in the intent item to
2526 : * skip those that we've already processed and use the AIL iteration mechanism's
2527 : * generation count to try to speed this up at least a bit.
2528 : *
2529 : * When we start, we know that the intents are the only things in the AIL. As we
2530 : * process them, however, other items are added to the AIL. Hence we know we
2531 : * have started recovery on all the pending intents when we find an non-intent
2532 : * item in the AIL.
2533 : */
2534 : STATIC int
2535 10294 : xlog_recover_process_intents(
2536 : struct xlog *log)
2537 : {
2538 10294 : LIST_HEAD(capture_list);
2539 10294 : struct xfs_ail_cursor cur;
2540 10294 : struct xfs_log_item *lip;
2541 10294 : struct xfs_ail *ailp;
2542 10294 : int error = 0;
2543 : #if defined(DEBUG) || defined(XFS_WARN)
2544 10294 : xfs_lsn_t last_lsn;
2545 : #endif
2546 :
2547 10294 : ailp = log->l_ailp;
2548 10294 : spin_lock(&ailp->ail_lock);
2549 : #if defined(DEBUG) || defined(XFS_WARN)
2550 10294 : last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2551 : #endif
2552 10294 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2553 14326 : lip != NULL;
2554 4032 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
2555 4034 : const struct xfs_item_ops *ops;
2556 :
2557 4034 : if (!xlog_item_is_intent(lip))
2558 : break;
2559 :
2560 : /*
2561 : * We should never see a redo item with a LSN higher than
2562 : * the last transaction we found in the log at the start
2563 : * of recovery.
2564 : */
2565 8068 : ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2566 :
2567 : /*
2568 : * NOTE: If your intent processing routine can create more
2569 : * deferred ops, you /must/ attach them to the capture list in
2570 : * the recover routine or else those subsequent intents will be
2571 : * replayed in the wrong order!
2572 : *
2573 : * The recovery function can free the log item, so we must not
2574 : * access lip after it returns.
2575 : */
2576 4034 : spin_unlock(&ailp->ail_lock);
2577 4034 : ops = lip->li_ops;
2578 4034 : error = ops->iop_recover(lip, &capture_list);
2579 4034 : spin_lock(&ailp->ail_lock);
2580 4034 : if (error) {
2581 2 : trace_xlog_intent_recovery_failed(log->l_mp, error,
2582 2 : ops->iop_recover);
2583 2 : break;
2584 : }
2585 : }
2586 :
2587 10294 : xfs_trans_ail_cursor_done(&cur);
2588 10294 : spin_unlock(&ailp->ail_lock);
2589 10294 : if (error)
2590 2 : goto err;
2591 :
2592 10292 : error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2593 10292 : if (error)
2594 0 : goto err;
2595 :
2596 : return 0;
2597 2 : err:
2598 2 : xlog_abort_defer_ops(log->l_mp, &capture_list);
2599 2 : return error;
2600 : }
2601 :
2602 : /*
2603 : * A cancel occurs when the mount has failed and we're bailing out. Release all
2604 : * pending log intent items that we haven't started recovery on so they don't
2605 : * pin the AIL.
2606 : */
2607 : STATIC void
2608 2 : xlog_recover_cancel_intents(
2609 : struct xlog *log)
2610 : {
2611 2 : struct xfs_log_item *lip;
2612 2 : struct xfs_ail_cursor cur;
2613 2 : struct xfs_ail *ailp;
2614 :
2615 2 : ailp = log->l_ailp;
2616 2 : spin_lock(&ailp->ail_lock);
2617 2 : lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2618 2 : while (lip != NULL) {
2619 0 : if (!xlog_item_is_intent(lip))
2620 : break;
2621 :
2622 0 : spin_unlock(&ailp->ail_lock);
2623 0 : lip->li_ops->iop_release(lip);
2624 0 : spin_lock(&ailp->ail_lock);
2625 0 : lip = xfs_trans_ail_cursor_next(ailp, &cur);
2626 : }
2627 :
2628 2 : xfs_trans_ail_cursor_done(&cur);
2629 2 : spin_unlock(&ailp->ail_lock);
2630 2 : }
2631 :
2632 : /*
2633 : * This routine performs a transaction to null out a bad inode pointer
2634 : * in an agi unlinked inode hash bucket.
2635 : */
2636 : STATIC void
2637 4 : xlog_recover_clear_agi_bucket(
2638 : struct xfs_perag *pag,
2639 : int bucket)
2640 : {
2641 4 : struct xfs_mount *mp = pag->pag_mount;
2642 4 : struct xfs_trans *tp;
2643 4 : struct xfs_agi *agi;
2644 4 : struct xfs_buf *agibp;
2645 4 : int offset;
2646 4 : int error;
2647 :
2648 4 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2649 4 : if (error)
2650 4 : goto out_error;
2651 :
2652 0 : error = xfs_read_agi(pag, tp, &agibp);
2653 0 : if (error)
2654 0 : goto out_abort;
2655 :
2656 0 : agi = agibp->b_addr;
2657 0 : agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
2658 0 : offset = offsetof(xfs_agi_t, agi_unlinked) +
2659 : (sizeof(xfs_agino_t) * bucket);
2660 0 : xfs_trans_log_buf(tp, agibp, offset,
2661 : (offset + sizeof(xfs_agino_t) - 1));
2662 :
2663 0 : error = xfs_trans_commit(tp);
2664 0 : if (error)
2665 0 : goto out_error;
2666 : return;
2667 :
2668 : out_abort:
2669 0 : xfs_trans_cancel(tp);
2670 4 : out_error:
2671 4 : xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
2672 : pag->pag_agno);
2673 4 : return;
2674 : }
2675 :
2676 : static int
2677 2632832 : xlog_recover_iunlink_bucket(
2678 : struct xfs_perag *pag,
2679 : struct xfs_agi *agi,
2680 : int bucket)
2681 : {
2682 2632832 : struct xfs_mount *mp = pag->pag_mount;
2683 2632832 : struct xfs_inode *prev_ip = NULL;
2684 2632832 : struct xfs_inode *ip;
2685 2632832 : xfs_agino_t prev_agino, agino;
2686 2632832 : int error = 0;
2687 :
2688 2632832 : agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2689 2798987 : while (agino != NULLAGINO) {
2690 332310 : error = xfs_iget(mp, NULL,
2691 166155 : XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
2692 : 0, 0, &ip);
2693 166155 : if (error)
2694 : break;
2695 :
2696 166155 : ASSERT(VFS_I(ip)->i_nlink == 0);
2697 166155 : ASSERT(VFS_I(ip)->i_mode != 0);
2698 166155 : xfs_iflags_clear(ip, XFS_IRECOVERY);
2699 166155 : agino = ip->i_next_unlinked;
2700 :
2701 166155 : if (prev_ip) {
2702 161110 : ip->i_prev_unlinked = prev_agino;
2703 161110 : xfs_irele(prev_ip);
2704 :
2705 : /*
2706 : * Ensure the inode is removed from the unlinked list
2707 : * before we continue so that it won't race with
2708 : * building the in-memory list here. This could be
2709 : * serialised with the agibp lock, but that just
2710 : * serialises via lockstepping and it's much simpler
2711 : * just to flush the inodegc queue and wait for it to
2712 : * complete.
2713 : */
2714 161110 : error = xfs_inodegc_flush(mp);
2715 161110 : if (error)
2716 : break;
2717 : }
2718 :
2719 166155 : prev_agino = agino;
2720 166155 : prev_ip = ip;
2721 : }
2722 :
2723 2632832 : if (prev_ip) {
2724 5045 : int error2;
2725 :
2726 5045 : ip->i_prev_unlinked = prev_agino;
2727 5045 : xfs_irele(prev_ip);
2728 :
2729 5045 : error2 = xfs_inodegc_flush(mp);
2730 5045 : if (error2 && !error)
2731 4 : return error2;
2732 : }
2733 : return error;
2734 : }
2735 :
2736 : /*
2737 : * Recover AGI unlinked lists
2738 : *
2739 : * This is called during recovery to process any inodes which we unlinked but
2740 : * not freed when the system crashed. These inodes will be on the lists in the
2741 : * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2742 : * any inodes found on the lists. Each inode is removed from the lists when it
2743 : * has been fully truncated and is freed. The freeing of the inode and its
2744 : * removal from the list must be atomic.
2745 : *
2746 : * If everything we touch in the agi processing loop is already in memory, this
2747 : * loop can hold the cpu for a long time. It runs without lock contention,
 * memory allocation contention, the need to wait for IO, etc, and so will run
2749 : * until we either run out of inodes to process, run low on memory or we run out
2750 : * of log space.
2751 : *
2752 : * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2753 : * and can prevent other filesystem work (such as CIL pushes) from running. This
2754 : * can lead to deadlocks if the recovery process runs out of log reservation
2755 : * space. Hence we need to yield the CPU when there is other kernel work
2756 : * scheduled on this CPU to ensure other scheduled work can run without undue
2757 : * latency.
2758 : */
2759 : static void
2760 41150 : xlog_recover_iunlink_ag(
2761 : struct xfs_perag *pag)
2762 : {
2763 41150 : struct xfs_agi *agi;
2764 41150 : struct xfs_buf *agibp;
2765 41150 : int bucket;
2766 41150 : int error;
2767 :
2768 41150 : error = xfs_read_agi(pag, NULL, &agibp);
2769 41150 : if (error) {
2770 : /*
2771 : * AGI is b0rked. Don't process it.
2772 : *
2773 : * We should probably mark the filesystem as corrupt after we've
2774 : * recovered all the ag's we can....
2775 : */
2776 12 : return;
2777 : }
2778 :
2779 : /*
2780 : * Unlock the buffer so that it can be acquired in the normal course of
2781 : * the transaction to truncate and free each inode. Because we are not
2782 : * racing with anyone else here for the AGI buffer, we don't even need
2783 : * to hold it locked to read the initial unlinked bucket entries out of
2784 : * the buffer. We keep buffer reference though, so that it stays pinned
2785 : * in memory while we need the buffer.
2786 : */
2787 41138 : agi = agibp->b_addr;
2788 41138 : xfs_buf_unlock(agibp);
2789 :
2790 2715108 : for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
2791 2632832 : error = xlog_recover_iunlink_bucket(pag, agi, bucket);
2792 2632832 : if (error) {
2793 : /*
2794 : * Bucket is unrecoverable, so only a repair scan can
2795 : * free the remaining unlinked inodes. Just empty the
2796 : * bucket and remaining inodes on it unreferenced and
2797 : * unfreeable.
2798 : */
2799 4 : xlog_recover_clear_agi_bucket(pag, bucket);
2800 : }
2801 : }
2802 :
2803 41138 : xfs_buf_rele(agibp);
2804 : }
2805 :
2806 : static void
2807 10292 : xlog_recover_process_iunlinks(
2808 : struct xlog *log)
2809 : {
2810 10292 : struct xfs_perag *pag;
2811 10292 : xfs_agnumber_t agno;
2812 :
2813 51442 : for_each_perag(log->l_mp, agno, pag)
2814 41150 : xlog_recover_iunlink_ag(pag);
2815 10292 : }
2816 :
2817 : STATIC void
2818 2295766 : xlog_unpack_data(
2819 : struct xlog_rec_header *rhead,
2820 : char *dp,
2821 : struct xlog *log)
2822 : {
2823 2295766 : int i, j, k;
2824 :
2825 125221172 : for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
2826 122925406 : i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2827 122925406 : *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
2828 122925406 : dp += BBSIZE;
2829 : }
2830 :
2831 2295766 : if (xfs_has_logv2(log->l_mp)) {
2832 : xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2833 2602090 : for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
2834 306324 : j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2835 306324 : k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2836 306324 : *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
2837 306324 : dp += BBSIZE;
2838 : }
2839 : }
2840 2295766 : }
2841 :
2842 : /*
2843 : * CRC check, unpack and process a log record.
2844 : */
2845 : STATIC int
2846 3513764 : xlog_recover_process(
2847 : struct xlog *log,
2848 : struct hlist_head rhash[],
2849 : struct xlog_rec_header *rhead,
2850 : char *dp,
2851 : int pass,
2852 : struct list_head *buffer_list)
2853 : {
2854 3513764 : __le32 old_crc = rhead->h_crc;
2855 3513764 : __le32 crc;
2856 :
2857 3513764 : crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
2858 :
2859 : /*
2860 : * Nothing else to do if this is a CRC verification pass. Just return
2861 : * if this a record with a non-zero crc. Unfortunately, mkfs always
2862 : * sets old_crc to 0 so we must consider this valid even on v5 supers.
2863 : * Otherwise, return EFSBADCRC on failure so the callers up the stack
2864 : * know precisely what failed.
2865 : */
2866 3513764 : if (pass == XLOG_RECOVER_CRCPASS) {
2867 1217998 : if (old_crc && crc != old_crc)
2868 : return -EFSBADCRC;
2869 1217987 : return 0;
2870 : }
2871 :
2872 : /*
2873 : * We're in the normal recovery path. Issue a warning if and only if the
2874 : * CRC in the header is non-zero. This is an advisory warning and the
2875 : * zero CRC check prevents warnings from being emitted when upgrading
2876 : * the kernel from one that does not add CRCs by default.
2877 : */
2878 2295766 : if (crc != old_crc) {
2879 0 : if (old_crc || xfs_has_crc(log->l_mp)) {
2880 0 : xfs_alert(log->l_mp,
2881 : "log record CRC mismatch: found 0x%x, expected 0x%x.",
2882 : le32_to_cpu(old_crc),
2883 : le32_to_cpu(crc));
2884 0 : xfs_hex_dump(dp, 32);
2885 : }
2886 :
2887 : /*
2888 : * If the filesystem is CRC enabled, this mismatch becomes a
2889 : * fatal log corruption failure.
2890 : */
2891 0 : if (xfs_has_crc(log->l_mp)) {
2892 0 : XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2893 0 : return -EFSCORRUPTED;
2894 : }
2895 : }
2896 :
2897 2295766 : xlog_unpack_data(rhead, dp, log);
2898 :
2899 2295766 : return xlog_recover_process_data(log, rhash, rhead, dp, pass,
2900 : buffer_list);
2901 : }
2902 :
2903 : STATIC int
2904 3554944 : xlog_valid_rec_header(
2905 : struct xlog *log,
2906 : struct xlog_rec_header *rhead,
2907 : xfs_daddr_t blkno,
2908 : int bufsize)
2909 : {
2910 3554944 : int hlen;
2911 :
2912 3554944 : if (XFS_IS_CORRUPT(log->l_mp,
2913 : rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
2914 0 : return -EFSCORRUPTED;
2915 3554944 : if (XFS_IS_CORRUPT(log->l_mp,
2916 : (!rhead->h_version ||
2917 : (be32_to_cpu(rhead->h_version) &
2918 : (~XLOG_VERSION_OKBITS))))) {
2919 0 : xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
2920 : __func__, be32_to_cpu(rhead->h_version));
2921 0 : return -EFSCORRUPTED;
2922 : }
2923 :
2924 : /*
2925 : * LR body must have data (or it wouldn't have been written)
2926 : * and h_len must not be greater than LR buffer size.
2927 : */
2928 3554944 : hlen = be32_to_cpu(rhead->h_len);
2929 3554944 : if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
2930 0 : return -EFSCORRUPTED;
2931 :
2932 3554944 : if (XFS_IS_CORRUPT(log->l_mp,
2933 : blkno > log->l_logBBsize || blkno > INT_MAX))
2934 0 : return -EFSCORRUPTED;
2935 : return 0;
2936 : }
2937 :
/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 *
 * When an error reaches the common exit path and @first_bad is non-NULL,
 * *first_bad is set to rhead_blk, the block at which the record that was
 * being read or processed starts.  The early -ENOMEM returns leave it
 * untouched.
 *
 * Returns 0 on success or a negative errno.  An error from reading or
 * processing records takes precedence over an error from submitting the
 * delayed write buffer list.
 */
STATIC int
xlog_do_recovery_pass(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			pass,
	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no, rblk_no;
	xfs_daddr_t		rhead_blk;
	char			*offset;
	char			*hbp, *dbp;	/* header and data buffers */
	int			error = 0, h_size, h_len;
	int			error2 = 0;
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	int			i;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];
	LIST_HEAD		(buffer_list);

	ASSERT(head_blk != tail_blk);
	blk_no = rhead_blk = tail_blk;

	for (i = 0; i < XLOG_RHASH_SIZE; i++)
		INIT_HLIST_HEAD(&rhash[i]);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_has_logv2(log->l_mp)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_alloc_buffer(log, 1);
		if (!hbp)
			return -ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;

		/*
		 * xfsprogs has a bug where record length is based on lsunit but
		 * h_size (iclog size) is hardcoded to 32k. Now that we
		 * unconditionally CRC verify the unmount record, this means the
		 * log buffer can be too small for the record and cause an
		 * overrun.
		 *
		 * Detect this condition here. Use lsunit for the buffer size as
		 * long as this looks like the mkfs case. Otherwise, return an
		 * error to avoid a buffer overrun.
		 */
		h_size = be32_to_cpu(rhead->h_size);
		h_len = be32_to_cpu(rhead->h_len);
		if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
		    rhead->h_num_logops == cpu_to_be32(1)) {
			xfs_warn(log->l_mp,
		"invalid iclog size (%d bytes), using lsunit (%d bytes)",
				 h_size, log->l_mp->m_logbsize);
			h_size = log->l_mp->m_logbsize;
		}

		error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
		if (error)
			goto bread_err1;

		/*
		 * Replace the single block header buffer if the header spans
		 * more than one block.  An allocation failure here is caught
		 * by the !hbp check after this if/else.
		 */
		hblks = xlog_logrec_hblks(log, rhead);
		if (hblks != 1) {
			kmem_free(hbp);
			hbp = xlog_alloc_buffer(log, hblks);
		}
	} else {
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_alloc_buffer(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}

	if (!hbp)
		return -ENOMEM;
	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
	if (!dbp) {
		kmem_free(hbp);
		return -ENOMEM;
	}

	/*
	 * NOTE(review): rhash was already initialised with INIT_HLIST_HEAD()
	 * above, so this memset looks redundant - confirm before removing.
	 */
	memset(rhash, 0, sizeof(rhash));
	if (tail_blk > head_blk) {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery.
		 */
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_noalign(log, 0,
						wrapped_hblks,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
					split_hblks ? blk_no : 0, h_size);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/*
			 * Read the log record data in multiple reads if it
			 * wraps around the end of the log. Note that if the
			 * header already wrapped, blk_no could point past the
			 * end of the log. The record data is contiguous in
			 * that case.
			 */
			if (blk_no + bblks <= log->l_logBBsize ||
			    blk_no >= log->l_logBBsize) {
				rblk_no = xlog_wrap_logbno(log, blk_no);
				error = xlog_bread(log, rblk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = dbp;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							split_bblks, dbp,
							&offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (start of the record
				 *   data) _first_, then the log start (rest of
				 *   the record data)
				 *   - order is important.
				 */
				error = xlog_bread_noalign(log, 0,
						bblks - split_bblks,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_recover_process(log, rhash, rhead, offset,
						     pass, &buffer_list);
			if (error)
				goto bread_err2;

			/* Advance to the header of the next record. */
			blk_no += bblks;
			rhead_blk = blk_no;
		}

		/* Wrap the block number back to the start of the log. */
		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;
		rhead_blk = blk_no;
	}

	/* read first part of physical log */
	while (blk_no < head_blk) {
		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
		if (error)
			goto bread_err2;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
		if (error)
			goto bread_err2;

		/* blocks in data section */
		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
				   &offset);
		if (error)
			goto bread_err2;

		error = xlog_recover_process(log, rhash, rhead, offset, pass,
					     &buffer_list);
		if (error)
			goto bread_err2;

		blk_no += bblks + hblks;
		rhead_blk = blk_no;
	}

	/* Common exit path: reached on success as well as on error. */
 bread_err2:
	kmem_free(dbp);
 bread_err1:
	kmem_free(hbp);

	/*
	 * Submit buffers that have been added from the last record processed,
	 * regardless of error status.
	 */
	if (!list_empty(&buffer_list))
		error2 = xfs_buf_delwri_submit(&buffer_list);

	if (error && first_bad)
		*first_bad = rhead_blk;

	/*
	 * Transactions are freed at commit time but transactions without commit
	 * records on disk are never committed. Free any that may be left in the
	 * hash table.
	 */
	for (i = 0; i < XLOG_RHASH_SIZE; i++) {
		struct hlist_node	*tmp;
		struct xlog_recover	*trans;

		hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
			xlog_recover_free_trans(trans);
	}

	/* A record read/processing error wins over a buffer submit error. */
	return error ? error : error2;
}
3224 :
3225 : /*
3226 : * Do the recovery of the log. We actually do this in two phases.
3227 : * The two passes are necessary in order to implement the function
3228 : * of cancelling a record written into the log. The first pass
3229 : * determines those things which have been cancelled, and the
3230 : * second pass replays log items normally except for those which
3231 : * have been cancelled. The handling of the replay and cancellations
3232 : * takes place in the log item type specific routines.
3233 : *
3234 : * The table of items which have cancel records in the log is allocated
3235 : * and freed at this level, since only here do we know when all of
3236 : * the log recovery has been completed.
3237 : */
3238 : STATIC int
3239 10294 : xlog_do_log_recovery(
3240 : struct xlog *log,
3241 : xfs_daddr_t head_blk,
3242 : xfs_daddr_t tail_blk)
3243 : {
3244 10294 : int error;
3245 :
3246 10294 : ASSERT(head_blk != tail_blk);
3247 :
3248 : /*
3249 : * First do a pass to find all of the cancelled buf log items.
3250 : * Store them in the buf_cancel_table for use in the second pass.
3251 : */
3252 10294 : error = xlog_alloc_buf_cancel_table(log);
3253 10294 : if (error)
3254 : return error;
3255 :
3256 10294 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3257 : XLOG_RECOVER_PASS1, NULL);
3258 10294 : if (error != 0)
3259 0 : goto out_cancel;
3260 :
3261 : /*
3262 : * Then do a second pass to actually recover the items in the log.
3263 : * When it is complete free the table of buf cancel items.
3264 : */
3265 10294 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3266 : XLOG_RECOVER_PASS2, NULL);
3267 10294 : if (!error)
3268 10294 : xlog_check_buf_cancel_table(log);
3269 0 : out_cancel:
3270 10294 : xlog_free_buf_cancel_table(log);
3271 10294 : return error;
3272 : }
3273 :
3274 : /*
3275 : * Do the actual recovery
3276 : */
3277 : STATIC int
3278 10294 : xlog_do_recover(
3279 : struct xlog *log,
3280 : xfs_daddr_t head_blk,
3281 : xfs_daddr_t tail_blk)
3282 : {
3283 10294 : struct xfs_mount *mp = log->l_mp;
3284 10294 : struct xfs_buf *bp = mp->m_sb_bp;
3285 10294 : struct xfs_sb *sbp = &mp->m_sb;
3286 10294 : int error;
3287 :
3288 10294 : trace_xfs_log_recover(log, head_blk, tail_blk);
3289 :
3290 : /*
3291 : * First replay the images in the log.
3292 : */
3293 10294 : error = xlog_do_log_recovery(log, head_blk, tail_blk);
3294 10294 : if (error)
3295 : return error;
3296 :
3297 20588 : if (xlog_is_shutdown(log))
3298 : return -EIO;
3299 :
3300 : /*
3301 : * We now update the tail_lsn since much of the recovery has completed
3302 : * and there may be space available to use. If there were no extent
3303 : * or iunlinks, we can free up the entire log and set the tail_lsn to
3304 : * be the last_sync_lsn. This was set in xlog_find_tail to be the
3305 : * lsn of the last known good LR on disk. If there are extent frees
3306 : * or iunlinks they will have some entries in the AIL; so we look at
3307 : * the AIL to determine how to set the tail_lsn.
3308 : */
3309 10294 : xlog_assign_tail_lsn(mp);
3310 :
3311 : /*
3312 : * Now that we've finished replaying all buffer and inode updates,
3313 : * re-read the superblock and reverify it.
3314 : */
3315 10294 : xfs_buf_lock(bp);
3316 10294 : xfs_buf_hold(bp);
3317 10294 : error = _xfs_buf_read(bp, XBF_READ);
3318 10294 : if (error) {
3319 0 : if (!xlog_is_shutdown(log)) {
3320 0 : xfs_buf_ioerror_alert(bp, __this_address);
3321 0 : ASSERT(0);
3322 : }
3323 0 : xfs_buf_relse(bp);
3324 0 : return error;
3325 : }
3326 :
3327 : /* Convert superblock from on-disk format */
3328 10294 : xfs_sb_from_disk(sbp, bp->b_addr);
3329 10294 : xfs_buf_relse(bp);
3330 :
3331 : /* re-initialise in-core superblock and geometry structures */
3332 10294 : mp->m_features |= xfs_sb_version_to_features(sbp);
3333 10294 : xfs_reinit_percpu_counters(mp);
3334 10294 : error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
3335 : &mp->m_maxagi);
3336 10294 : if (error) {
3337 0 : xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3338 0 : return error;
3339 : }
3340 10294 : mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
3341 :
3342 : /* Normal transactions can now occur */
3343 10294 : clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
3344 : return 0;
3345 : }
3346 :
3347 : /*
3348 : * Perform recovery and re-initialize some log variables in xlog_find_tail.
3349 : *
3350 : * Return error or zero.
3351 : */
3352 : int
3353 22481 : xlog_recover(
3354 : struct xlog *log)
3355 : {
3356 22481 : xfs_daddr_t head_blk, tail_blk;
3357 22481 : int error;
3358 :
3359 : /* find the tail of the log */
3360 22481 : error = xlog_find_tail(log, &head_blk, &tail_blk);
3361 22481 : if (error)
3362 : return error;
3363 :
3364 : /*
3365 : * The superblock was read before the log was available and thus the LSN
3366 : * could not be verified. Check the superblock LSN against the current
3367 : * LSN now that it's known.
3368 : */
3369 44912 : if (xfs_has_crc(log->l_mp) &&
3370 22435 : !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3371 : return -EINVAL;
3372 :
3373 22475 : if (tail_blk != head_blk) {
3374 : /* There used to be a comment here:
3375 : *
3376 : * disallow recovery on read-only mounts. note -- mount
3377 : * checks for ENOSPC and turns it into an intelligent
3378 : * error message.
3379 : * ...but this is no longer true. Now, unless you specify
3380 : * NORECOVERY (in which case this function would never be
3381 : * called), we just go ahead and recover. We do this all
3382 : * under the vfs layer, so we can get away with it unless
3383 : * the device itself is read-only, in which case we fail.
3384 : */
3385 10296 : if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3386 : return error;
3387 : }
3388 :
3389 : /*
3390 : * Version 5 superblock log feature mask validation. We know the
3391 : * log is dirty so check if there are any unknown log features
3392 : * in what we need to recover. If there are unknown features
3393 : * (e.g. unsupported transactions, then simply reject the
3394 : * attempt at recovery before touching anything.
3395 : */
3396 10294 : if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
3397 : xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3398 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3399 0 : xfs_warn(log->l_mp,
3400 : "Superblock has unknown incompatible log features (0x%x) enabled.",
3401 : (log->l_mp->m_sb.sb_features_log_incompat &
3402 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3403 0 : xfs_warn(log->l_mp,
3404 : "The log can not be fully and/or safely recovered by this kernel.");
3405 0 : xfs_warn(log->l_mp,
3406 : "Please recover the log on a kernel that supports the unknown features.");
3407 0 : return -EINVAL;
3408 : }
3409 :
3410 : /*
3411 : * Delay log recovery if the debug hook is set. This is debug
3412 : * instrumentation to coordinate simulation of I/O failures with
3413 : * log recovery.
3414 : */
3415 10294 : if (xfs_globals.log_recovery_delay) {
3416 4 : xfs_notice(log->l_mp,
3417 : "Delaying log recovery for %d seconds.",
3418 : xfs_globals.log_recovery_delay);
3419 4 : msleep(xfs_globals.log_recovery_delay * 1000);
3420 : }
3421 :
3422 10294 : xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3423 : log->l_mp->m_logname ? log->l_mp->m_logname
3424 : : "internal");
3425 :
3426 10294 : error = xlog_do_recover(log, head_blk, tail_blk);
3427 10294 : set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
3428 : }
3429 : return error;
3430 : }
3431 :
3432 : /*
3433 : * In the first part of recovery we replay inodes and buffers and build up the
3434 : * list of intents which need to be processed. Here we process the intents and
3435 : * clean up the on disk unlinked inode lists. This is separated from the first
3436 : * part of recovery so that the root and real-time bitmap inodes can be read in
3437 : * from disk in between the two stages. This is necessary so that we can free
3438 : * space in the real-time portion of the file system.
3439 : */
3440 : int
3441 10294 : xlog_recover_finish(
3442 : struct xlog *log)
3443 : {
3444 10294 : int error;
3445 :
3446 10294 : error = xlog_recover_process_intents(log);
3447 : /*
3448 : * Sync the log to get all the intents that have done item out of
3449 : * the AIL. This isn't absolutely necessary, but it helps in case
3450 : * the unlink transactions would have problems pushing the intents
3451 : * out of the way.
3452 : */
3453 10294 : xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3454 10294 : if (error) {
3455 : /*
3456 : * Cancel all the unprocessed intent items now so that we don't
3457 : * leave them pinned in the AIL. This can cause the AIL to
3458 : * livelock on the pinned item if anyone tries to push the AIL
3459 : * (inode reclaim does this) before we get around to
3460 : * xfs_log_mount_cancel.
3461 : */
3462 2 : xlog_recover_cancel_intents(log);
3463 2 : xfs_alert(log->l_mp, "Failed to recover intents");
3464 2 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3465 2 : return error;
3466 : }
3467 :
3468 : /*
3469 : * Now that we've recovered the log and all the intents, we can clear
3470 : * the log incompat feature bits in the superblock because there's no
3471 : * longer anything to protect. We rely on the AIL push to write out the
3472 : * updated superblock after everything else.
3473 : */
3474 10292 : if (xfs_clear_incompat_log_features(log->l_mp)) {
3475 58 : error = xfs_sync_sb(log->l_mp, false);
3476 58 : if (error < 0) {
3477 0 : xfs_alert(log->l_mp,
3478 : "Failed to clear log incompat features on recovery");
3479 0 : return error;
3480 : }
3481 : }
3482 :
3483 10292 : xlog_recover_process_iunlinks(log);
3484 :
3485 : /*
3486 : * Recover any CoW staging blocks that are still referenced by the
3487 : * ondisk refcount metadata. During mount there cannot be any live
3488 : * staging extents as we have not permitted any user modifications.
3489 : * Therefore, it is safe to free them all right now, even on a
3490 : * read-only mount.
3491 : */
3492 10292 : error = xfs_reflink_recover_cow(log->l_mp);
3493 10292 : if (error) {
3494 8 : xfs_alert(log->l_mp,
3495 : "Failed to recover leftover CoW staging extents, err %d.",
3496 : error);
3497 : /*
3498 : * If we get an error here, make sure the log is shut down
3499 : * but return zero so that any log items committed since the
3500 : * end of intents processing can be pushed through the CIL
3501 : * and AIL.
3502 : */
3503 8 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3504 : }
3505 :
3506 : return 0;
3507 : }
3508 :
/*
 * Cancel the pending intent items, but only when xlog_recovery_needed()
 * reports that recovery was needed for this log.
 */
void
xlog_recover_cancel(
	struct xlog	*log)
{
	if (!xlog_recovery_needed(log))
		return;

	xlog_recover_cancel_intents(log);
}
3516 :
|