Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_bit.h"
13 : #include "xfs_sb.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_trans.h"
18 : #include "xfs_log.h"
19 : #include "xfs_log_priv.h"
20 : #include "xfs_log_recover.h"
21 : #include "xfs_trans_priv.h"
22 : #include "xfs_alloc.h"
23 : #include "xfs_ialloc.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_icache.h"
26 : #include "xfs_error.h"
27 : #include "xfs_buf_item.h"
28 : #include "xfs_ag.h"
29 : #include "xfs_quota.h"
30 : #include "xfs_reflink.h"
31 :
/*
 * Midpoint of two block numbers, rounded down. Both arguments are fully
 * parenthesized so that expressions containing lower-precedence operators
 * (e.g. `a & b`, `c ? d : e`) are averaged as the caller intended.
 */
#define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
33 :
34 : STATIC int
35 : xlog_find_zeroed(
36 : struct xlog *,
37 : xfs_daddr_t *);
38 : STATIC int
39 : xlog_clear_stale_blocks(
40 : struct xlog *,
41 : xfs_lsn_t);
42 : STATIC int
43 : xlog_do_recovery_pass(
44 : struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
45 :
46 : /*
47 : * Sector aligned buffer routines for buffer create/read/write/access
48 : */
49 :
50 : /*
51 : * Verify the log-relative block number and length in basic blocks are valid for
52 : * an operation involving the given XFS log buffer. Returns true if the fields
53 : * are valid, false otherwise.
54 : */
55 : static inline bool
56 : xlog_verify_bno(
57 : struct xlog *log,
58 : xfs_daddr_t blk_no,
59 : int bbcount)
60 : {
61 17895372 : if (blk_no < 0 || blk_no >= log->l_logBBsize)
62 : return false;
63 18353956 : if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
64 0 : return false;
65 : return true;
66 : }
67 :
68 : /*
69 : * Allocate a buffer to hold log data. The buffer needs to be able to map to
70 : * a range of nbblks basic blocks at any valid offset within the log.
71 : */
72 : static char *
73 458584 : xlog_alloc_buffer(
74 : struct xlog *log,
75 : int nbblks)
76 : {
77 : /*
78 : * Pass log block 0 since we don't have an addr yet, buffer will be
79 : * verified on read.
80 : */
81 917168 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
82 0 : xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
83 : nbblks);
84 0 : return NULL;
85 : }
86 :
87 : /*
88 : * We do log I/O in units of log sectors (a power-of-2 multiple of the
89 : * basic block size), so we round up the requested size to accommodate
90 : * the basic blocks required for complete log sectors.
91 : *
92 : * In addition, the buffer may be used for a non-sector-aligned block
93 : * offset, in which case an I/O of the requested size could extend
94 : * beyond the end of the buffer. If the requested size is only 1 basic
95 : * block it will never straddle a sector boundary, so this won't be an
96 : * issue. Nor will this be a problem if the log I/O is done in basic
97 : * blocks (sector size 1). But otherwise we extend the buffer by one
98 : * extra log sector to ensure there's space to accommodate this
99 : * possibility.
100 : */
101 458584 : if (nbblks > 1 && log->l_sectBBsize > 1)
102 168073 : nbblks += log->l_sectBBsize;
103 458584 : nbblks = round_up(nbblks, log->l_sectBBsize);
104 458584 : return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
105 : }
106 :
107 : /*
108 : * Return the address of the start of the given block number's data
109 : * in a log buffer. The buffer covers a log sector-aligned region.
110 : */
111 : static inline unsigned int
112 : xlog_align(
113 : struct xlog *log,
114 : xfs_daddr_t blk_no)
115 : {
116 17893729 : return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
117 : }
118 :
119 : static int
120 17895372 : xlog_do_io(
121 : struct xlog *log,
122 : xfs_daddr_t blk_no,
123 : unsigned int nbblks,
124 : char *data,
125 : enum req_op op)
126 : {
127 17895372 : int error;
128 :
129 35790744 : if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
130 0 : xfs_warn(log->l_mp,
131 : "Invalid log block/length (0x%llx, 0x%x) for buffer",
132 : blk_no, nbblks);
133 0 : return -EFSCORRUPTED;
134 : }
135 :
136 17895372 : blk_no = round_down(blk_no, log->l_sectBBsize);
137 17895372 : nbblks = round_up(nbblks, log->l_sectBBsize);
138 17895372 : ASSERT(nbblks > 0);
139 :
140 17895372 : error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
141 : BBTOB(nbblks), data, op);
142 17895372 : if (error && !xlog_is_shutdown(log)) {
143 0 : xfs_alert(log->l_mp,
144 : "log recovery %s I/O error at daddr 0x%llx len %d error %d",
145 : op == REQ_OP_WRITE ? "write" : "read",
146 : blk_no, nbblks, error);
147 : }
148 : return error;
149 : }
150 :
151 : STATIC int
152 1643 : xlog_bread_noalign(
153 : struct xlog *log,
154 : xfs_daddr_t blk_no,
155 : int nbblks,
156 : char *data)
157 : {
158 1643 : return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
159 : }
160 :
161 : STATIC int
162 17403315 : xlog_bread(
163 : struct xlog *log,
164 : xfs_daddr_t blk_no,
165 : int nbblks,
166 : char *data,
167 : char **offset)
168 : {
169 17403315 : int error;
170 :
171 17403315 : error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
172 17403315 : if (!error)
173 17403315 : *offset = data + xlog_align(log, blk_no);
174 17403315 : return error;
175 : }
176 :
177 : STATIC int
178 490414 : xlog_bwrite(
179 : struct xlog *log,
180 : xfs_daddr_t blk_no,
181 : int nbblks,
182 : char *data)
183 : {
184 490414 : return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
185 : }
186 :
#ifdef DEBUG
/*
 * Debug aid: print the uuid and log format expected by the superblock next
 * to the uuid and format found in the given log record header.
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	/* What this mount expects ... */
	xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);

	/* ... versus what the log record header carries. */
	xfs_debug(mp, " log : uuid = %pU, fmt = %d",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
/* Non-debug builds compile the dump away entirely. */
#define xlog_header_check_dump(mp, head)
#endif
204 :
205 : /*
206 : * check log record header for recovery
207 : */
208 : STATIC int
209 3124608 : xlog_header_check_recover(
210 : xfs_mount_t *mp,
211 : xlog_rec_header_t *head)
212 : {
213 3124608 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
214 :
215 : /*
216 : * IRIX doesn't write the h_fmt field and leaves it zeroed
217 : * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
218 : * a dirty log created in IRIX.
219 : */
220 3124608 : if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
221 0 : xfs_warn(mp,
222 : "dirty log written in incompatible format - can't recover");
223 0 : xlog_header_check_dump(mp, head);
224 0 : return -EFSCORRUPTED;
225 : }
226 3124608 : if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
227 : &head->h_fs_uuid))) {
228 0 : xfs_warn(mp,
229 : "dirty log entry has mismatched uuid - can't recover");
230 0 : xlog_header_check_dump(mp, head);
231 0 : return -EFSCORRUPTED;
232 : }
233 : return 0;
234 : }
235 :
236 : /*
237 : * read the head block of the log and check the header
238 : */
239 : STATIC int
240 59215 : xlog_header_check_mount(
241 : xfs_mount_t *mp,
242 : xlog_rec_header_t *head)
243 : {
244 59215 : ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
245 :
246 59215 : if (uuid_is_null(&head->h_fs_uuid)) {
247 : /*
248 : * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
249 : * h_fs_uuid is null, we assume this log was last mounted
250 : * by IRIX and continue.
251 : */
252 0 : xfs_warn(mp, "null uuid in log - IRIX style log");
253 59215 : } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
254 : &head->h_fs_uuid))) {
255 0 : xfs_warn(mp, "log has mismatched uuid - can't recover");
256 0 : xlog_header_check_dump(mp, head);
257 0 : return -EFSCORRUPTED;
258 : }
259 : return 0;
260 : }
261 :
262 : /*
263 : * This routine finds (to an approximation) the first block in the physical
264 : * log which contains the given cycle. It uses a binary search algorithm.
265 : * Note that the algorithm can not be perfect because the disk will not
266 : * necessarily be perfect.
267 : */
268 : STATIC int
269 59211 : xlog_find_cycle_start(
270 : struct xlog *log,
271 : char *buffer,
272 : xfs_daddr_t first_blk,
273 : xfs_daddr_t *last_blk,
274 : uint cycle)
275 : {
276 59211 : char *offset;
277 59211 : xfs_daddr_t mid_blk;
278 59211 : xfs_daddr_t end_blk;
279 59211 : uint mid_cycle;
280 59211 : int error;
281 :
282 59211 : end_blk = *last_blk;
283 59211 : mid_blk = BLK_AVG(first_blk, end_blk);
284 1077457 : while (mid_blk != first_blk && mid_blk != end_blk) {
285 1018246 : error = xlog_bread(log, mid_blk, 1, buffer, &offset);
286 1018246 : if (error)
287 0 : return error;
288 1018246 : mid_cycle = xlog_get_cycle(offset);
289 1018246 : if (mid_cycle == cycle)
290 : end_blk = mid_blk; /* last_half_cycle == mid_cycle */
291 : else
292 272371 : first_blk = mid_blk; /* first_half_cycle == mid_cycle */
293 1018246 : mid_blk = BLK_AVG(first_blk, end_blk);
294 : }
295 59211 : ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
296 : (mid_blk == end_blk && mid_blk-1 == first_blk));
297 :
298 59211 : *last_blk = end_blk;
299 :
300 59211 : return 0;
301 : }
302 :
303 : /*
304 : * Check that a range of blocks does not contain stop_on_cycle_no.
305 : * Fill in *new_blk with the block offset where such a block is
306 : * found, or with -1 (an invalid block number) if there is no such
307 : * block in the range. The scan needs to occur from front to back
308 : * and the pointer into the region must be updated since a later
309 : * routine will need to perform another test.
310 : */
311 : STATIC int
312 60197 : xlog_find_verify_cycle(
313 : struct xlog *log,
314 : xfs_daddr_t start_blk,
315 : int nbblks,
316 : uint stop_on_cycle_no,
317 : xfs_daddr_t *new_blk)
318 : {
319 60197 : xfs_daddr_t i, j;
320 60197 : uint cycle;
321 60197 : char *buffer;
322 60197 : xfs_daddr_t bufblks;
323 60197 : char *buf = NULL;
324 60197 : int error = 0;
325 :
326 : /*
327 : * Greedily allocate a buffer big enough to handle the full
328 : * range of basic blocks we'll be examining. If that fails,
329 : * try a smaller size. We need to be able to read at least
330 : * a log sector, or we're out of luck.
331 : */
332 120394 : bufblks = 1 << ffs(nbblks);
333 60197 : while (bufblks > log->l_logBBsize)
334 0 : bufblks >>= 1;
335 60197 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
336 0 : bufblks >>= 1;
337 0 : if (bufblks < log->l_sectBBsize)
338 : return -ENOMEM;
339 : }
340 :
341 1748895 : for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
342 1688708 : int bcount;
343 :
344 1688708 : bcount = min(bufblks, (start_blk + nbblks - i));
345 :
346 1688708 : error = xlog_bread(log, i, bcount, buffer, &buf);
347 1688708 : if (error)
348 0 : goto out;
349 :
350 121224019 : for (j = 0; j < bcount; j++) {
351 119535321 : cycle = xlog_get_cycle(buf);
352 119535321 : if (cycle == stop_on_cycle_no) {
353 10 : *new_blk = i+j;
354 10 : goto out;
355 : }
356 :
357 119535311 : buf += BBSIZE;
358 : }
359 : }
360 :
361 60187 : *new_blk = -1;
362 :
363 60197 : out:
364 60197 : kmem_free(buffer);
365 60197 : return error;
366 : }
367 :
368 : static inline int
369 173402 : xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
370 : {
371 173402 : if (xfs_has_logv2(log->l_mp)) {
372 173342 : int h_size = be32_to_cpu(rh->h_size);
373 :
374 173342 : if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
375 : h_size > XLOG_HEADER_CYCLE_SIZE)
376 2231 : return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
377 : }
378 : return 1;
379 : }
380 :
381 : /*
382 : * Potentially backup over partial log record write.
383 : *
384 : * In the typical case, last_blk is the number of the block directly after
385 : * a good log record. Therefore, we subtract one to get the block number
386 : * of the last block in the given buffer. extra_bblks contains the number
387 : * of blocks we would have read on a previous read. This happens when the
388 : * last log record is split over the end of the physical log.
389 : *
390 : * extra_bblks is the number of blocks potentially verified on a previous
391 : * call to this routine.
392 : */
393 : STATIC int
394 59238 : xlog_find_verify_log_record(
395 : struct xlog *log,
396 : xfs_daddr_t start_blk,
397 : xfs_daddr_t *last_blk,
398 : int extra_bblks)
399 : {
400 59238 : xfs_daddr_t i;
401 59238 : char *buffer;
402 59238 : char *offset = NULL;
403 59238 : xlog_rec_header_t *head = NULL;
404 59238 : int error = 0;
405 59238 : int smallmem = 0;
406 59238 : int num_blks = *last_blk - start_blk;
407 59238 : int xhdrs;
408 :
409 59238 : ASSERT(start_blk != 0 || *last_blk != start_blk);
410 :
411 59238 : buffer = xlog_alloc_buffer(log, num_blks);
412 59238 : if (!buffer) {
413 0 : buffer = xlog_alloc_buffer(log, 1);
414 0 : if (!buffer)
415 : return -ENOMEM;
416 : smallmem = 1;
417 : } else {
418 59238 : error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
419 59238 : if (error)
420 0 : goto out;
421 59238 : offset += ((num_blks - 1) << BBSHIFT);
422 : }
423 :
424 904408 : for (i = (*last_blk) - 1; i >= 0; i--) {
425 904405 : if (i < start_blk) {
426 : /* valid log record not found */
427 20 : xfs_warn(log->l_mp,
428 : "Log inconsistent (didn't find previous header)");
429 20 : ASSERT(0);
430 20 : error = -EFSCORRUPTED;
431 20 : goto out;
432 : }
433 :
434 904385 : if (smallmem) {
435 0 : error = xlog_bread(log, i, 1, buffer, &offset);
436 0 : if (error)
437 0 : goto out;
438 : }
439 :
440 904385 : head = (xlog_rec_header_t *)offset;
441 :
442 904385 : if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
443 : break;
444 :
445 845170 : if (!smallmem)
446 845170 : offset -= BBSIZE;
447 : }
448 :
449 : /*
450 : * We hit the beginning of the physical log & still no header. Return
451 : * to caller. If caller can handle a return of -1, then this routine
452 : * will be called again for the end of the physical log.
453 : */
454 59218 : if (i == -1) {
455 3 : error = 1;
456 3 : goto out;
457 : }
458 :
459 : /*
460 : * We have the final block of the good log (the first block
461 : * of the log record _before_ the head. So we check the uuid.
462 : */
463 59215 : if ((error = xlog_header_check_mount(log->l_mp, head)))
464 0 : goto out;
465 :
466 : /*
467 : * We may have found a log record header before we expected one.
468 : * last_blk will be the 1st block # with a given cycle #. We may end
469 : * up reading an entire log record. In this case, we don't want to
470 : * reset last_blk. Only when last_blk points in the middle of a log
471 : * record do we update last_blk.
472 : */
473 59215 : xhdrs = xlog_logrec_hblks(log, head);
474 :
475 59215 : if (*last_blk - i + extra_bblks !=
476 59215 : BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
477 133 : *last_blk = i;
478 :
479 59082 : out:
480 59238 : kmem_free(buffer);
481 59238 : return error;
482 : }
483 :
484 : /*
485 : * Head is defined to be the point of the log where the next log write
486 : * could go. This means that incomplete LR writes at the end are
487 : * eliminated when calculating the head. We aren't guaranteed that previous
488 : * LR have complete transactions. We only know that a cycle number of
489 : * current cycle number -1 won't be present in the log if we start writing
490 : * from our current block number.
491 : *
492 : * last_blk contains the block number of the first block with a given
493 : * cycle number.
494 : *
495 : * Return: zero if normal, non-zero if error.
496 : */
497 : STATIC int
498 59235 : xlog_find_head(
499 : struct xlog *log,
500 : xfs_daddr_t *return_head_blk)
501 : {
502 59235 : char *buffer;
503 59235 : char *offset;
504 59235 : xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
505 59235 : int num_scan_bblks;
506 59235 : uint first_half_cycle, last_half_cycle;
507 59235 : uint stop_on_cycle;
508 59235 : int error, log_bbnum = log->l_logBBsize;
509 :
510 : /* Is the end of the log device zeroed? */
511 59235 : error = xlog_find_zeroed(log, &first_blk);
512 59235 : if (error < 0) {
513 0 : xfs_warn(log->l_mp, "empty log check failed");
514 0 : return error;
515 : }
516 59235 : if (error == 1) {
517 36710 : *return_head_blk = first_blk;
518 :
519 : /* Is the whole lot zeroed? */
520 36710 : if (!first_blk) {
521 : /* Linux XFS shouldn't generate totally zeroed logs -
522 : * mkfs etc write a dummy unmount record to a fresh
523 : * log so we can store the uuid in there
524 : */
525 0 : xfs_warn(log->l_mp, "totally zeroed log");
526 : }
527 :
528 36710 : return 0;
529 : }
530 :
531 22525 : first_blk = 0; /* get cycle # of 1st block */
532 22525 : buffer = xlog_alloc_buffer(log, 1);
533 22525 : if (!buffer)
534 : return -ENOMEM;
535 :
536 22525 : error = xlog_bread(log, 0, 1, buffer, &offset);
537 22525 : if (error)
538 0 : goto out_free_buffer;
539 :
540 22525 : first_half_cycle = xlog_get_cycle(offset);
541 :
542 22525 : last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
543 22525 : error = xlog_bread(log, last_blk, 1, buffer, &offset);
544 22525 : if (error)
545 0 : goto out_free_buffer;
546 :
547 22525 : last_half_cycle = xlog_get_cycle(offset);
548 22525 : ASSERT(last_half_cycle != 0);
549 :
550 : /*
551 : * If the 1st half cycle number is equal to the last half cycle number,
552 : * then the entire log is stamped with the same cycle number. In this
553 : * case, head_blk can't be set to zero (which makes sense). The below
554 : * math doesn't work out properly with head_blk equal to zero. Instead,
555 : * we set it to log_bbnum which is an invalid block number, but this
556 : * value makes the math correct. If head_blk doesn't changed through
557 : * all the tests below, *head_blk is set to zero at the very end rather
558 : * than log_bbnum. In a sense, log_bbnum and zero are the same block
559 : * in a circular file.
560 : */
561 22525 : if (first_half_cycle == last_half_cycle) {
562 : /*
563 : * In this case we believe that the entire log should have
564 : * cycle number last_half_cycle. We need to scan backwards
565 : * from the end verifying that there are no holes still
566 : * containing last_half_cycle - 1. If we find such a hole,
567 : * then the start of that hole will be the new head. The
568 : * simple case looks like
569 : * x | x ... | x - 1 | x
570 : * Another case that fits this picture would be
571 : * x | x + 1 | x ... | x
572 : * In this case the head really is somewhere at the end of the
573 : * log, as one of the latest writes at the beginning was
574 : * incomplete.
575 : * One more case is
576 : * x | x + 1 | x ... | x - 1 | x
577 : * This is really the combination of the above two cases, and
578 : * the head has to end up at the start of the x-1 hole at the
579 : * end of the log.
580 : *
581 : * In the 256k log case, we will read from the beginning to the
582 : * end of the log and search for cycle numbers equal to x-1.
583 : * We don't worry about the x+1 blocks that we encounter,
584 : * because we know that they cannot be the head since the log
585 : * started with x.
586 : */
587 24 : head_blk = log_bbnum;
588 24 : stop_on_cycle = last_half_cycle - 1;
589 : } else {
590 : /*
591 : * In this case we want to find the first block with cycle
592 : * number matching last_half_cycle. We expect the log to be
593 : * some variation on
594 : * x + 1 ... | x ... | x
595 : * The first block with cycle number x (last_half_cycle) will
596 : * be where the new head belongs. First we do a binary search
597 : * for the first occurrence of last_half_cycle. The binary
598 : * search may not be totally accurate, so then we scan back
599 : * from there looking for occurrences of last_half_cycle before
600 : * us. If that backwards scan wraps around the beginning of
601 : * the log, then we look for occurrences of last_half_cycle - 1
602 : * at the end of the log. The cases we're looking for look
603 : * like
604 : * v binary search stopped here
605 : * x + 1 ... | x | x + 1 | x ... | x
606 : * ^ but we want to locate this spot
607 : * or
608 : * <---------> less than scan distance
609 : * x + 1 ... | x ... | x - 1 | x
610 : * ^ we want to locate this spot
611 : */
612 22501 : stop_on_cycle = last_half_cycle;
613 22501 : error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
614 : last_half_cycle);
615 22501 : if (error)
616 0 : goto out_free_buffer;
617 : }
618 :
619 : /*
620 : * Now validate the answer. Scan back some number of maximum possible
621 : * blocks and make sure each one has the expected cycle number. The
622 : * maximum is determined by the total possible amount of buffering
623 : * in the in-core log. The following number can be made tighter if
624 : * we actually look at the block size of the filesystem.
625 : */
626 22525 : num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
627 22525 : if (head_blk >= num_scan_bblks) {
628 : /*
629 : * We are guaranteed that the entire check can be performed
630 : * in one buffer.
631 : */
632 21563 : start_blk = head_blk - num_scan_bblks;
633 21563 : if ((error = xlog_find_verify_cycle(log,
634 : start_blk, num_scan_bblks,
635 : stop_on_cycle, &new_blk)))
636 0 : goto out_free_buffer;
637 21563 : if (new_blk != -1)
638 10 : head_blk = new_blk;
639 : } else { /* need to read 2 parts of log */
640 : /*
641 : * We are going to scan backwards in the log in two parts.
642 : * First we scan the physical end of the log. In this part
643 : * of the log, we are looking for blocks with cycle number
644 : * last_half_cycle - 1.
645 : * If we find one, then we know that the log starts there, as
646 : * we've found a hole that didn't get written in going around
647 : * the end of the physical log. The simple case for this is
648 : * x + 1 ... | x ... | x - 1 | x
649 : * <---------> less than scan distance
650 : * If all of the blocks at the end of the log have cycle number
651 : * last_half_cycle, then we check the blocks at the start of
652 : * the log looking for occurrences of last_half_cycle. If we
653 : * find one, then our current estimate for the location of the
654 : * first occurrence of last_half_cycle is wrong and we move
655 : * back to the hole we've found. This case looks like
656 : * x + 1 ... | x | x + 1 | x ...
657 : * ^ binary search stopped here
658 : * Another case we need to handle that only occurs in 256k
659 : * logs is
660 : * x + 1 ... | x ... | x+1 | x ...
661 : * ^ binary search stops here
662 : * In a 256k log, the scan at the end of the log will see the
663 : * x + 1 blocks. We need to skip past those since that is
664 : * certainly not the head of the log. By searching for
665 : * last_half_cycle-1 we accomplish that.
666 : */
667 962 : ASSERT(head_blk <= INT_MAX &&
668 : (xfs_daddr_t) num_scan_bblks >= head_blk);
669 962 : start_blk = log_bbnum - (num_scan_bblks - head_blk);
670 962 : if ((error = xlog_find_verify_cycle(log, start_blk,
671 962 : num_scan_bblks - (int)head_blk,
672 : (stop_on_cycle - 1), &new_blk)))
673 0 : goto out_free_buffer;
674 962 : if (new_blk != -1) {
675 0 : head_blk = new_blk;
676 0 : goto validate_head;
677 : }
678 :
679 : /*
680 : * Scan beginning of log now. The last part of the physical
681 : * log is good. This scan needs to verify that it doesn't find
682 : * the last_half_cycle.
683 : */
684 962 : start_blk = 0;
685 962 : ASSERT(head_blk <= INT_MAX);
686 962 : if ((error = xlog_find_verify_cycle(log,
687 : start_blk, (int)head_blk,
688 : stop_on_cycle, &new_blk)))
689 0 : goto out_free_buffer;
690 962 : if (new_blk != -1)
691 0 : head_blk = new_blk;
692 : }
693 :
694 962 : validate_head:
695 : /*
696 : * Now we need to make sure head_blk is not pointing to a block in
697 : * the middle of a log record.
698 : */
699 22525 : num_scan_bblks = XLOG_REC_SHIFT(log);
700 22525 : if (head_blk >= num_scan_bblks) {
701 22297 : start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
702 :
703 : /* start ptr at last block ptr before head_blk */
704 22297 : error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
705 22297 : if (error == 1)
706 : error = -EIO;
707 22297 : if (error)
708 20 : goto out_free_buffer;
709 : } else {
710 228 : start_blk = 0;
711 228 : ASSERT(head_blk <= INT_MAX);
712 228 : error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
713 228 : if (error < 0)
714 0 : goto out_free_buffer;
715 228 : if (error == 1) {
716 : /* We hit the beginning of the log during our search */
717 3 : start_blk = log_bbnum - (num_scan_bblks - head_blk);
718 3 : new_blk = log_bbnum;
719 3 : ASSERT(start_blk <= INT_MAX &&
720 : (xfs_daddr_t) log_bbnum-start_blk >= 0);
721 3 : ASSERT(head_blk <= INT_MAX);
722 3 : error = xlog_find_verify_log_record(log, start_blk,
723 : &new_blk, (int)head_blk);
724 3 : if (error == 1)
725 : error = -EIO;
726 3 : if (error)
727 0 : goto out_free_buffer;
728 3 : if (new_blk != log_bbnum)
729 0 : head_blk = new_blk;
730 225 : } else if (error)
731 0 : goto out_free_buffer;
732 : }
733 :
734 22505 : kmem_free(buffer);
735 22505 : if (head_blk == log_bbnum)
736 4 : *return_head_blk = 0;
737 : else
738 22501 : *return_head_blk = head_blk;
739 : /*
740 : * When returning here, we have a good block number. Bad block
741 : * means that during a previous crash, we didn't have a clean break
742 : * from cycle number N to cycle number N-1. In this case, we need
743 : * to find the first block with cycle number N-1.
744 : */
745 : return 0;
746 :
747 20 : out_free_buffer:
748 20 : kmem_free(buffer);
749 20 : if (error)
750 20 : xfs_warn(log->l_mp, "failed to find log head");
751 20 : return error;
752 : }
753 :
754 : /*
755 : * Seek backwards in the log for log record headers.
756 : *
757 : * Given a starting log block, walk backwards until we find the provided number
758 : * of records or hit the provided tail block. The return value is the number of
759 : * records encountered or a negative error code. The log block and buffer
760 : * pointer of the last record seen are returned in rblk and rhead respectively.
761 : */
762 : STATIC int
763 73006 : xlog_rseek_logrec_hdr(
764 : struct xlog *log,
765 : xfs_daddr_t head_blk,
766 : xfs_daddr_t tail_blk,
767 : int count,
768 : char *buffer,
769 : xfs_daddr_t *rblk,
770 : struct xlog_rec_header **rhead,
771 : bool *wrapped)
772 : {
773 73006 : int i;
774 73006 : int error;
775 73006 : int found = 0;
776 73006 : char *offset = NULL;
777 73006 : xfs_daddr_t end_blk;
778 :
779 73006 : *wrapped = false;
780 :
781 : /*
782 : * Walk backwards from the head block until we hit the tail or the first
783 : * block in the log.
784 : */
785 73006 : end_blk = head_blk > tail_blk ? tail_blk : 0;
786 4763711 : for (i = (int) head_blk - 1; i >= end_blk; i--) {
787 4758045 : error = xlog_bread(log, i, 1, buffer, &offset);
788 4758045 : if (error)
789 0 : goto out_error;
790 :
791 4758045 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
792 137368 : *rblk = i;
793 137368 : *rhead = (struct xlog_rec_header *) offset;
794 137368 : if (++found == count)
795 : break;
796 : }
797 : }
798 :
799 : /*
800 : * If we haven't hit the tail block or the log record header count,
801 : * start looking again from the end of the physical log. Note that
802 : * callers can pass head == tail if the tail is not yet known.
803 : */
804 73006 : if (tail_blk >= head_blk && found != count) {
805 70737 : for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
806 70736 : error = xlog_bread(log, i, 1, buffer, &offset);
807 70736 : if (error)
808 0 : goto out_error;
809 :
810 70736 : if (*(__be32 *)offset ==
811 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
812 265 : *wrapped = true;
813 265 : *rblk = i;
814 265 : *rhead = (struct xlog_rec_header *) offset;
815 265 : if (++found == count)
816 : break;
817 : }
818 : }
819 : }
820 :
821 : return found;
822 :
823 : out_error:
824 : return error;
825 : }
826 :
827 : /*
828 : * Seek forward in the log for log record headers.
829 : *
830 : * Given head and tail blocks, walk forward from the tail block until we find
831 : * the provided number of records or hit the head block. The return value is the
832 : * number of records encountered or a negative error code. The log block and
833 : * buffer pointer of the last record seen are returned in rblk and rhead
834 : * respectively.
835 : */
836 : STATIC int
837 13735 : xlog_seek_logrec_hdr(
838 : struct xlog *log,
839 : xfs_daddr_t head_blk,
840 : xfs_daddr_t tail_blk,
841 : int count,
842 : char *buffer,
843 : xfs_daddr_t *rblk,
844 : struct xlog_rec_header **rhead,
845 : bool *wrapped)
846 : {
847 13735 : int i;
848 13735 : int error;
849 13735 : int found = 0;
850 13735 : char *offset = NULL;
851 13735 : xfs_daddr_t end_blk;
852 :
853 13735 : *wrapped = false;
854 :
855 : /*
856 : * Walk forward from the tail block until we hit the head or the last
857 : * block in the log.
858 : */
859 13735 : end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
860 13735 : for (i = (int) tail_blk; i <= end_blk; i++) {
861 13735 : error = xlog_bread(log, i, 1, buffer, &offset);
862 13735 : if (error)
863 0 : goto out_error;
864 :
865 13735 : if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
866 13735 : *rblk = i;
867 13735 : *rhead = (struct xlog_rec_header *) offset;
868 13735 : if (++found == count)
869 : break;
870 : }
871 : }
872 :
873 : /*
874 : * If we haven't hit the head block or the log record header count,
875 : * start looking again from the start of the physical log.
876 : */
877 13735 : if (tail_blk > head_blk && found != count) {
878 0 : for (i = 0; i < (int) head_blk; i++) {
879 0 : error = xlog_bread(log, i, 1, buffer, &offset);
880 0 : if (error)
881 0 : goto out_error;
882 :
883 0 : if (*(__be32 *)offset ==
884 : cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
885 0 : *wrapped = true;
886 0 : *rblk = i;
887 0 : *rhead = (struct xlog_rec_header *) offset;
888 0 : if (++found == count)
889 : break;
890 : }
891 : }
892 : }
893 :
894 : return found;
895 :
896 : out_error:
897 : return error;
898 : }
899 :
900 : /*
901 : * Calculate distance from head to tail (i.e., unused space in the log).
902 : */
903 : static inline int
904 : xlog_tail_distance(
905 : struct xlog *log,
906 : xfs_daddr_t head_blk,
907 : xfs_daddr_t tail_blk)
908 : {
909 0 : if (head_blk < tail_blk)
910 0 : return tail_blk - head_blk;
911 :
912 0 : return tail_blk + (log->l_logBBsize - head_blk);
913 : }
914 :
915 : /*
916 : * Verify the log tail. This is particularly important when torn or incomplete
917 : * writes have been detected near the front of the log and the head has been
918 : * walked back accordingly.
919 : *
920 : * We also have to handle the case where the tail was pinned and the head
921 : * blocked behind the tail right before a crash. If the tail had been pushed
922 : * immediately prior to the crash and the subsequent checkpoint was only
923 : * partially written, it's possible it overwrote the last referenced tail in the
924 : * log with garbage. This is not a coherency problem because the tail must have
925 : * been pushed before it can be overwritten, but appears as log corruption to
926 : * recovery because we have no way to know the tail was updated if the
927 : * subsequent checkpoint didn't write successfully.
928 : *
929 : * Therefore, CRC check the log from tail to head. If a failure occurs and the
930 : * offending record is within max iclog bufs from the head, walk the tail
931 : * forward and retry until a valid tail is found or corruption is detected out
932 : * of the range of a possible overwrite.
933 : */
934 : STATIC int
935 13735 : xlog_verify_tail(
936 : struct xlog *log,
937 : xfs_daddr_t head_blk,
938 : xfs_daddr_t *tail_blk,
939 : int hsize)
940 : {
941 13735 : struct xlog_rec_header *thead;
942 13735 : char *buffer;
943 13735 : xfs_daddr_t first_bad;
944 13735 : int error = 0;
945 13735 : bool wrapped;
946 13735 : xfs_daddr_t tmp_tail;
947 13735 : xfs_daddr_t orig_tail = *tail_blk;
948 :
949 13735 : buffer = xlog_alloc_buffer(log, 1);
950 13735 : if (!buffer)
951 : return -ENOMEM;
952 :
953 : /*
954 : * Make sure the tail points to a record (returns positive count on
955 : * success).
956 : */
957 13735 : error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
958 : &tmp_tail, &thead, &wrapped);
959 13735 : if (error < 0)
960 0 : goto out;
961 13735 : if (*tail_blk != tmp_tail)
962 0 : *tail_blk = tmp_tail;
963 :
964 : /*
965 : * Run a CRC check from the tail to the head. We can't just check
966 : * MAX_ICLOGS records past the tail because the tail may point to stale
967 : * blocks cleared during the search for the head/tail. These blocks are
968 : * overwritten with zero-length records and thus record count is not a
969 : * reliable indicator of the iclog state before a crash.
970 : */
971 13735 : first_bad = 0;
972 13735 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
973 : XLOG_RECOVER_CRCPASS, &first_bad);
974 13735 : while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
975 0 : int tail_distance;
976 :
977 : /*
978 : * Is corruption within range of the head? If so, retry from
979 : * the next record. Otherwise return an error.
980 : */
981 0 : tail_distance = xlog_tail_distance(log, head_blk, first_bad);
982 0 : if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
983 : break;
984 :
985 : /* skip to the next record; returns positive count on success */
986 0 : error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
987 : buffer, &tmp_tail, &thead, &wrapped);
988 0 : if (error < 0)
989 0 : goto out;
990 :
991 0 : *tail_blk = tmp_tail;
992 0 : first_bad = 0;
993 0 : error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
994 : XLOG_RECOVER_CRCPASS, &first_bad);
995 : }
996 :
997 13735 : if (!error && *tail_blk != orig_tail)
998 0 : xfs_warn(log->l_mp,
999 : "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1000 : orig_tail, *tail_blk);
1001 13735 : out:
1002 13735 : kmem_free(buffer);
1003 13735 : return error;
1004 : }
1005 :
1006 : /*
1007 : * Detect and trim torn writes from the head of the log.
1008 : *
1009 : * Storage without sector atomicity guarantees can result in torn writes in the
1010 : * log in the event of a crash. Our only means to detect this scenario is via
1011 : * CRC verification. While we can't always be certain that CRC verification
1012 : * failure is due to a torn write vs. an unrelated corruption, we do know that
1013 : * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1014 : * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1015 : * the log and treat failures in this range as torn writes as a matter of
1016 : * policy. In the event of CRC failure, the head is walked back to the last good
1017 : * record in the log and the tail is updated from that record and verified.
1018 : */
1019 : STATIC int
1020 13735 : xlog_verify_head(
1021 : struct xlog *log,
1022 : xfs_daddr_t *head_blk, /* in/out: unverified head */
1023 : xfs_daddr_t *tail_blk, /* out: tail block */
1024 : char *buffer,
1025 : xfs_daddr_t *rhead_blk, /* start blk of last record */
1026 : struct xlog_rec_header **rhead, /* ptr to last record */
1027 : bool *wrapped) /* last rec. wraps phys. log */
1028 : {
1029 13735 : struct xlog_rec_header *tmp_rhead;
1030 13735 : char *tmp_buffer;
1031 13735 : xfs_daddr_t first_bad;
1032 13735 : xfs_daddr_t tmp_rhead_blk;
1033 13735 : int found;
1034 13735 : int error;
1035 13735 : bool tmp_wrapped;
1036 :
1037 : /*
1038 : * Check the head of the log for torn writes. Search backwards from the
1039 : * head until we hit the tail or the maximum number of log record I/Os
1040 : * that could have been in flight at one time. Use a temporary buffer so
1041 : * we don't trash the rhead/buffer pointers from the caller.
1042 : */
1043 13735 : tmp_buffer = xlog_alloc_buffer(log, 1);
1044 13735 : if (!tmp_buffer)
1045 : return -ENOMEM;
1046 13735 : error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1047 : XLOG_MAX_ICLOGS, tmp_buffer,
1048 : &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1049 13735 : kmem_free(tmp_buffer);
1050 13735 : if (error < 0)
1051 : return error;
1052 :
1053 : /*
1054 : * Now run a CRC verification pass over the records starting at the
1055 : * block found above to the current head. If a CRC failure occurs, the
1056 : * log block of the first bad record is saved in first_bad.
1057 : */
1058 13735 : error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1059 : XLOG_RECOVER_CRCPASS, &first_bad);
1060 13735 : if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1061 : /*
1062 : * We've hit a potential torn write. Reset the error and warn
1063 : * about it.
1064 : */
1065 56 : error = 0;
1066 56 : xfs_warn(log->l_mp,
1067 : "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1068 : first_bad, *head_blk);
1069 :
1070 : /*
1071 : * Get the header block and buffer pointer for the last good
1072 : * record before the bad record.
1073 : *
1074 : * Note that xlog_find_tail() clears the blocks at the new head
1075 : * (i.e., the records with invalid CRC) if the cycle number
1076 : * matches the current cycle.
1077 : */
1078 56 : found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1079 : buffer, rhead_blk, rhead, wrapped);
1080 56 : if (found < 0)
1081 : return found;
1082 56 : if (found == 0) /* XXX: right thing to do here? */
1083 : return -EIO;
1084 :
1085 : /*
1086 : * Reset the head block to the starting block of the first bad
1087 : * log record and set the tail block based on the last good
1088 : * record.
1089 : *
1090 : * Bail out if the updated head/tail match as this indicates
1091 : * possible corruption outside of the acceptable
1092 : * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1093 : */
1094 56 : *head_blk = first_bad;
1095 56 : *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1096 56 : if (*head_blk == *tail_blk) {
1097 0 : ASSERT(0);
1098 0 : return 0;
1099 : }
1100 : }
1101 13679 : if (error)
1102 : return error;
1103 :
1104 13735 : return xlog_verify_tail(log, *head_blk, tail_blk,
1105 13735 : be32_to_cpu((*rhead)->h_size));
1106 : }
1107 :
1108 : /*
1109 : * We need to make sure we handle log wrapping properly, so we can't use the
1110 : * calculated logbno directly. Make sure it wraps to the correct bno inside the
1111 : * log.
1112 : *
1113 : * The log is limited to 32 bit sizes, so we use the appropriate modulus
1114 : * operation here and cast it back to a 64 bit daddr on return.
1115 : */
1116 : static inline xfs_daddr_t
1117 : xlog_wrap_logbno(
1118 : struct xlog *log,
1119 : xfs_daddr_t bno)
1120 : {
1121 385713 : int mod;
1122 :
1123 385713 : div_s64_rem(bno, log->l_logBBsize, &mod);
1124 385713 : return mod;
1125 : }
1126 :
1127 : /*
1128 : * Check whether the head of the log points to an unmount record. In other
1129 : * words, determine whether the log is clean. If so, update the in-core state
1130 : * appropriately.
1131 : */
1132 : static int
1133 59271 : xlog_check_unmount_rec(
1134 : struct xlog *log,
1135 : xfs_daddr_t *head_blk,
1136 : xfs_daddr_t *tail_blk,
1137 : struct xlog_rec_header *rhead,
1138 : xfs_daddr_t rhead_blk,
1139 : char *buffer,
1140 : bool *clean)
1141 : {
1142 59271 : struct xlog_op_header *op_head;
1143 59271 : xfs_daddr_t umount_data_blk;
1144 59271 : xfs_daddr_t after_umount_blk;
1145 59271 : int hblks;
1146 59271 : int error;
1147 59271 : char *offset;
1148 :
1149 59271 : *clean = false;
1150 :
1151 : /*
1152 : * Look for unmount record. If we find it, then we know there was a
1153 : * clean unmount. Since 'i' could be the last block in the physical
1154 : * log, we convert to a log block before comparing to the head_blk.
1155 : *
1156 : * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1157 : * below. We won't want to clear the unmount record if there is one, so
1158 : * we pass the lsn of the unmount record rather than the block after it.
1159 : */
1160 59271 : hblks = xlog_logrec_hblks(log, rhead);
1161 59271 : after_umount_blk = xlog_wrap_logbno(log,
1162 59271 : rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1163 :
1164 59271 : if (*head_blk == after_umount_blk &&
1165 59271 : be32_to_cpu(rhead->h_num_logops) == 1) {
1166 45486 : umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1167 45486 : error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1168 45486 : if (error)
1169 : return error;
1170 :
1171 45486 : op_head = (struct xlog_op_header *)offset;
1172 45486 : if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1173 : /*
1174 : * Set tail and last sync so that newly written log
1175 : * records will point recovery to after the current
1176 : * unmount record.
1177 : */
1178 45481 : xlog_assign_atomic_lsn(&log->l_tail_lsn,
1179 45481 : log->l_curr_cycle, after_umount_blk);
1180 45481 : xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1181 : log->l_curr_cycle, after_umount_blk);
1182 45481 : *tail_blk = after_umount_blk;
1183 :
1184 45481 : *clean = true;
1185 : }
1186 : }
1187 :
1188 : return 0;
1189 : }
1190 :
1191 : static void
1192 59271 : xlog_set_state(
1193 : struct xlog *log,
1194 : xfs_daddr_t head_blk,
1195 : struct xlog_rec_header *rhead,
1196 : xfs_daddr_t rhead_blk,
1197 : bool bump_cycle)
1198 : {
1199 : /*
1200 : * Reset log values according to the state of the log when we
1201 : * crashed. In the case where head_blk == 0, we bump curr_cycle
1202 : * one because the next write starts a new cycle rather than
1203 : * continuing the cycle of the last good log record. At this
1204 : * point we have guaranteed that all partial log records have been
1205 : * accounted for. Therefore, we know that the last good log record
1206 : * written was complete and ended exactly on the end boundary
1207 : * of the physical log.
1208 : */
1209 59271 : log->l_prev_block = rhead_blk;
1210 59271 : log->l_curr_block = (int)head_blk;
1211 59271 : log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1212 59271 : if (bump_cycle)
1213 140 : log->l_curr_cycle++;
1214 59271 : atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1215 59271 : atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1216 59271 : xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1217 : BBTOB(log->l_curr_block));
1218 59271 : xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1219 : BBTOB(log->l_curr_block));
1220 59271 : }
1221 :
1222 : /*
1223 : * Find the sync block number or the tail of the log.
1224 : *
1225 : * This will be the block number of the last record to have its
1226 : * associated buffers synced to disk. Every log record header has
1227 : * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1228 : * to get a sync block number. The only concern is to figure out which
1229 : * log record header to believe.
1230 : *
1231 : * The following algorithm uses the log record header with the largest
1232 : * lsn. The entire log record does not need to be valid. We only care
1233 : * that the header is valid.
1234 : *
1235 : * We could speed up search by using current head_blk buffer, but it is not
1236 : * available.
1237 : */
1238 : STATIC int
1239 59235 : xlog_find_tail(
1240 : struct xlog *log,
1241 : xfs_daddr_t *head_blk,
1242 : xfs_daddr_t *tail_blk)
1243 : {
1244 59235 : xlog_rec_header_t *rhead;
1245 59235 : char *offset = NULL;
1246 59235 : char *buffer;
1247 59235 : int error;
1248 59235 : xfs_daddr_t rhead_blk;
1249 59235 : xfs_lsn_t tail_lsn;
1250 59235 : bool wrapped = false;
1251 59235 : bool clean = false;
1252 :
1253 : /*
1254 : * Find previous log record
1255 : */
1256 59235 : if ((error = xlog_find_head(log, head_blk)))
1257 : return error;
1258 59215 : ASSERT(*head_blk < INT_MAX);
1259 :
1260 59215 : buffer = xlog_alloc_buffer(log, 1);
1261 59215 : if (!buffer)
1262 : return -ENOMEM;
1263 59215 : if (*head_blk == 0) { /* special case */
1264 137 : error = xlog_bread(log, 0, 1, buffer, &offset);
1265 137 : if (error)
1266 0 : goto done;
1267 :
1268 274 : if (xlog_get_cycle(offset) == 0) {
1269 0 : *tail_blk = 0;
1270 : /* leave all other log inited values alone */
1271 0 : goto done;
1272 : }
1273 : }
1274 :
1275 : /*
1276 : * Search backwards through the log looking for the log record header
1277 : * block. This wraps all the way back around to the head so something is
1278 : * seriously wrong if we can't find it.
1279 : */
1280 59215 : error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
1281 : &rhead_blk, &rhead, &wrapped);
1282 59215 : if (error < 0)
1283 0 : goto done;
1284 59215 : if (!error) {
1285 0 : xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1286 0 : error = -EFSCORRUPTED;
1287 0 : goto done;
1288 : }
1289 59215 : *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1290 :
1291 : /*
1292 : * Set the log state based on the current head record.
1293 : */
1294 59215 : xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1295 59215 : tail_lsn = atomic64_read(&log->l_tail_lsn);
1296 :
1297 : /*
1298 : * Look for an unmount record at the head of the log. This sets the log
1299 : * state to determine whether recovery is necessary.
1300 : */
1301 59215 : error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1302 : rhead_blk, buffer, &clean);
1303 59215 : if (error)
1304 0 : goto done;
1305 :
1306 : /*
1307 : * Verify the log head if the log is not clean (e.g., we have anything
1308 : * but an unmount record at the head). This uses CRC verification to
1309 : * detect and trim torn writes. If discovered, CRC failures are
1310 : * considered torn writes and the log head is trimmed accordingly.
1311 : *
1312 : * Note that we can only run CRC verification when the log is dirty
1313 : * because there's no guarantee that the log data behind an unmount
1314 : * record is compatible with the current architecture.
1315 : */
1316 59215 : if (!clean) {
1317 13735 : xfs_daddr_t orig_head = *head_blk;
1318 :
1319 13735 : error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1320 : &rhead_blk, &rhead, &wrapped);
1321 13735 : if (error)
1322 0 : goto done;
1323 :
1324 : /* update in-core state again if the head changed */
1325 13735 : if (*head_blk != orig_head) {
1326 56 : xlog_set_state(log, *head_blk, rhead, rhead_blk,
1327 : wrapped);
1328 56 : tail_lsn = atomic64_read(&log->l_tail_lsn);
1329 56 : error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1330 : rhead, rhead_blk, buffer,
1331 : &clean);
1332 56 : if (error)
1333 0 : goto done;
1334 : }
1335 : }
1336 :
1337 : /*
1338 : * Note that the unmount was clean. If the unmount was not clean, we
1339 : * need to know this to rebuild the superblock counters from the perag
1340 : * headers if we have a filesystem using non-persistent counters.
1341 : */
1342 59215 : if (clean)
1343 45481 : set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
1344 :
1345 : /*
1346 : * Make sure that there are no blocks in front of the head
1347 : * with the same cycle number as the head. This can happen
1348 : * because we allow multiple outstanding log writes concurrently,
1349 : * and the later writes might make it out before earlier ones.
1350 : *
1351 : * We use the lsn from before modifying it so that we'll never
1352 : * overwrite the unmount record after a clean unmount.
1353 : *
1354 : * Do this only if we are going to recover the filesystem
1355 : *
1356 : * NOTE: This used to say "if (!readonly)"
1357 : * However on Linux, we can & do recover a read-only filesystem.
1358 : * We only skip recovery if NORECOVERY is specified on mount,
1359 : * in which case we would not be here.
1360 : *
1361 : * But... if the -device- itself is readonly, just skip this.
1362 : * We can't recover this device anyway, so it won't matter.
1363 : */
1364 59215 : if (!xfs_readonly_buftarg(log->l_targ))
1365 59203 : error = xlog_clear_stale_blocks(log, tail_lsn);
1366 :
1367 12 : done:
1368 59215 : kmem_free(buffer);
1369 :
1370 59215 : if (error)
1371 0 : xfs_warn(log->l_mp, "failed to locate log tail");
1372 : return error;
1373 : }
1374 :
1375 : /*
1376 : * Is the log zeroed at all?
1377 : *
1378 : * The last binary search should be changed to perform an X block read
1379 : * once X becomes small enough. You can then search linearly through
1380 : * the X blocks. This will cut down on the number of reads we need to do.
1381 : *
1382 : * If the log is partially zeroed, this routine will pass back the blkno
1383 : * of the first block with cycle number 0. It won't have a complete LR
1384 : * preceding it.
1385 : *
1386 : * Return:
1387 : * 0 => the log is completely written to
1388 : * 1 => use *blk_no as the first block of the log
1389 : * <0 => error has occurred
1390 : */
1391 : STATIC int
1392 59235 : xlog_find_zeroed(
1393 : struct xlog *log,
1394 : xfs_daddr_t *blk_no)
1395 : {
1396 59235 : char *buffer;
1397 59235 : char *offset;
1398 59235 : uint first_cycle, last_cycle;
1399 59235 : xfs_daddr_t new_blk, last_blk, start_blk;
1400 59235 : xfs_daddr_t num_scan_bblks;
1401 59235 : int error, log_bbnum = log->l_logBBsize;
1402 :
1403 59235 : *blk_no = 0;
1404 :
1405 : /* check totally zeroed log */
1406 59235 : buffer = xlog_alloc_buffer(log, 1);
1407 59235 : if (!buffer)
1408 : return -ENOMEM;
1409 59235 : error = xlog_bread(log, 0, 1, buffer, &offset);
1410 59235 : if (error)
1411 0 : goto out_free_buffer;
1412 :
1413 59235 : first_cycle = xlog_get_cycle(offset);
1414 59235 : if (first_cycle == 0) { /* completely zeroed log */
1415 0 : *blk_no = 0;
1416 0 : kmem_free(buffer);
1417 0 : return 1;
1418 : }
1419 :
1420 : /* check partially zeroed log */
1421 59235 : error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1422 59235 : if (error)
1423 0 : goto out_free_buffer;
1424 :
1425 59235 : last_cycle = xlog_get_cycle(offset);
1426 59235 : if (last_cycle != 0) { /* log completely written to */
1427 22525 : kmem_free(buffer);
1428 22525 : return 0;
1429 : }
1430 :
1431 : /* we have a partially zeroed log */
1432 36710 : last_blk = log_bbnum-1;
1433 36710 : error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1434 36710 : if (error)
1435 0 : goto out_free_buffer;
1436 :
1437 : /*
1438 : * Validate the answer. Because there is no way to guarantee that
1439 : * the entire log is made up of log records which are the same size,
1440 : * we scan over the defined maximum blocks. At this point, the maximum
1441 : * is not chosen to mean anything special. XXXmiken
1442 : */
1443 36710 : num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1444 36710 : ASSERT(num_scan_bblks <= INT_MAX);
1445 :
1446 36710 : if (last_blk < num_scan_bblks)
1447 : num_scan_bblks = last_blk;
1448 36710 : start_blk = last_blk - num_scan_bblks;
1449 :
1450 : /*
1451 : * We search for any instances of cycle number 0 that occur before
1452 : * our current estimate of the head. What we're trying to detect is
1453 : * 1 ... | 0 | 1 | 0...
1454 : * ^ binary search ends here
1455 : */
1456 36710 : if ((error = xlog_find_verify_cycle(log, start_blk,
1457 : (int)num_scan_bblks, 0, &new_blk)))
1458 0 : goto out_free_buffer;
1459 36710 : if (new_blk != -1)
1460 0 : last_blk = new_blk;
1461 :
1462 : /*
1463 : * Potentially backup over partial log record write. We don't need
1464 : * to search the end of the log because we know it is zero.
1465 : */
1466 36710 : error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1467 36710 : if (error == 1)
1468 : error = -EIO;
1469 36710 : if (error)
1470 0 : goto out_free_buffer;
1471 :
1472 36710 : *blk_no = last_blk;
1473 36710 : out_free_buffer:
1474 36710 : kmem_free(buffer);
1475 36710 : if (error)
1476 0 : return error;
1477 : return 1;
1478 : }
1479 :
1480 : /*
1481 : * These are simple subroutines used by xlog_clear_stale_blocks() below
1482 : * to initialize a buffer full of empty log record headers and write
1483 : * them into the log.
1484 : */
1485 : STATIC void
1486 242384832 : xlog_add_record(
1487 : struct xlog *log,
1488 : char *buf,
1489 : int cycle,
1490 : int block,
1491 : int tail_cycle,
1492 : int tail_block)
1493 : {
1494 242384832 : xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1495 :
1496 242384832 : memset(buf, 0, BBSIZE);
1497 242384832 : recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1498 242384832 : recp->h_cycle = cpu_to_be32(cycle);
1499 242384832 : recp->h_version = cpu_to_be32(
1500 : xfs_has_logv2(log->l_mp) ? 2 : 1);
1501 242384832 : recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1502 242384832 : recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1503 242384832 : recp->h_fmt = cpu_to_be32(XLOG_FMT);
1504 484769664 : memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1505 242384832 : }
1506 :
1507 : STATIC int
1508 60076 : xlog_write_log_records(
1509 : struct xlog *log,
1510 : int cycle,
1511 : int start_block,
1512 : int blocks,
1513 : int tail_cycle,
1514 : int tail_block)
1515 : {
1516 60076 : char *offset;
1517 60076 : char *buffer;
1518 60076 : int balign, ealign;
1519 60076 : int sectbb = log->l_sectBBsize;
1520 60076 : int end_block = start_block + blocks;
1521 60076 : int bufblks;
1522 60076 : int error = 0;
1523 60076 : int i, j = 0;
1524 :
1525 : /*
1526 : * Greedily allocate a buffer big enough to handle the full
1527 : * range of basic blocks to be written. If that fails, try
1528 : * a smaller size. We need to be able to write at least a
1529 : * log sector, or we're out of luck.
1530 : */
1531 120152 : bufblks = 1 << ffs(blocks);
1532 60111 : while (bufblks > log->l_logBBsize)
1533 35 : bufblks >>= 1;
1534 60076 : while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1535 0 : bufblks >>= 1;
1536 0 : if (bufblks < sectbb)
1537 : return -ENOMEM;
1538 : }
1539 :
1540 : /* We may need to do a read at the start to fill in part of
1541 : * the buffer in the starting sector not covered by the first
1542 : * write below.
1543 : */
1544 60076 : balign = round_down(start_block, sectbb);
1545 60076 : if (balign != start_block) {
1546 0 : error = xlog_bread_noalign(log, start_block, 1, buffer);
1547 0 : if (error)
1548 0 : goto out_free_buffer;
1549 :
1550 0 : j = start_block - balign;
1551 : }
1552 :
1553 550490 : for (i = start_block; i < end_block; i += bufblks) {
1554 490414 : int bcount, endcount;
1555 :
1556 490414 : bcount = min(bufblks, end_block - start_block);
1557 490414 : endcount = bcount - j;
1558 :
1559 : /* We may need to do a read at the end to fill in part of
1560 : * the buffer in the final sector not covered by the write.
1561 : * If this is the same sector as the above read, skip it.
1562 : */
1563 490414 : ealign = round_down(end_block, sectbb);
1564 490414 : if (j == 0 && (start_block + endcount > ealign)) {
1565 0 : error = xlog_bread_noalign(log, ealign, sectbb,
1566 0 : buffer + BBTOB(ealign - start_block));
1567 0 : if (error)
1568 : break;
1569 :
1570 : }
1571 :
1572 490414 : offset = buffer + xlog_align(log, start_block);
1573 242875246 : for (; j < endcount; j++) {
1574 242384832 : xlog_add_record(log, offset, cycle, i+j,
1575 : tail_cycle, tail_block);
1576 242384832 : offset += BBSIZE;
1577 : }
1578 490414 : error = xlog_bwrite(log, start_block, endcount, buffer);
1579 490414 : if (error)
1580 : break;
1581 490414 : start_block += endcount;
1582 490414 : j = 0;
1583 : }
1584 :
1585 60076 : out_free_buffer:
1586 60076 : kmem_free(buffer);
1587 60076 : return error;
1588 : }
1589 :
1590 : /*
1591 : * This routine is called to blow away any incomplete log writes out
1592 : * in front of the log head. We do this so that we won't become confused
1593 : * if we come up, write only a little bit more, and then crash again.
1594 : * If we leave the partial log records out there, this situation could
1595 : * cause us to think those partial writes are valid blocks since they
1596 : * have the current cycle number. We get rid of them by overwriting them
1597 : * with empty log records with the old cycle number rather than the
1598 : * current one.
1599 : *
1600 : * The tail lsn is passed in rather than taken from
1601 : * the log so that we will not write over the unmount record after a
1602 : * clean unmount in a 512 block log. Doing so would leave the log without
1603 : * any valid log records in it until a new one was written. If we crashed
1604 : * during that time we would not be able to recover.
1605 : */
1606 : STATIC int
1607 59203 : xlog_clear_stale_blocks(
1608 : struct xlog *log,
1609 : xfs_lsn_t tail_lsn)
1610 : {
1611 59203 : int tail_cycle, head_cycle;
1612 59203 : int tail_block, head_block;
1613 59203 : int tail_distance, max_distance;
1614 59203 : int distance;
1615 59203 : int error;
1616 :
1617 59203 : tail_cycle = CYCLE_LSN(tail_lsn);
1618 59203 : tail_block = BLOCK_LSN(tail_lsn);
1619 59203 : head_cycle = log->l_curr_cycle;
1620 59203 : head_block = log->l_curr_block;
1621 :
1622 : /*
1623 : * Figure out the distance between the new head of the log
1624 : * and the tail. We want to write over any blocks beyond the
1625 : * head that we may have written just before the crash, but
1626 : * we don't want to overwrite the tail of the log.
1627 : */
1628 59203 : if (head_cycle == tail_cycle) {
1629 : /*
1630 : * The tail is behind the head in the physical log,
1631 : * so the distance from the head to the tail is the
1632 : * distance from the head to the end of the log plus
1633 : * the distance from the beginning of the log to the
1634 : * tail.
1635 : */
1636 58420 : if (XFS_IS_CORRUPT(log->l_mp,
1637 : head_block < tail_block ||
1638 : head_block >= log->l_logBBsize))
1639 0 : return -EFSCORRUPTED;
1640 58420 : tail_distance = tail_block + (log->l_logBBsize - head_block);
1641 : } else {
1642 : /*
1643 : * The head is behind the tail in the physical log,
1644 : * so the distance from the head to the tail is just
1645 : * the tail block minus the head block.
1646 : */
1647 783 : if (XFS_IS_CORRUPT(log->l_mp,
1648 : head_block >= tail_block ||
1649 : head_cycle != tail_cycle + 1))
1650 0 : return -EFSCORRUPTED;
1651 783 : tail_distance = tail_block - head_block;
1652 : }
1653 :
1654 : /*
1655 : * If the head is right up against the tail, we can't clear
1656 : * anything.
1657 : */
1658 59203 : if (tail_distance <= 0) {
1659 0 : ASSERT(tail_distance == 0);
1660 0 : return 0;
1661 : }
1662 :
1663 59203 : max_distance = XLOG_TOTAL_REC_SHIFT(log);
1664 : /*
1665 : * Take the smaller of the maximum amount of outstanding I/O
1666 : * we could have and the distance to the tail to clear out.
1667 : * We take the smaller so that we don't overwrite the tail and
1668 : * we don't waste all day writing from the head to the tail
1669 : * for no reason.
1670 : */
1671 59203 : max_distance = min(max_distance, tail_distance);
1672 :
1673 59203 : if ((head_block + max_distance) <= log->l_logBBsize) {
1674 : /*
1675 : * We can stomp all the blocks we need to without
1676 : * wrapping around the end of the log. Just do it
1677 : * in a single write. Use the cycle number of the
1678 : * current cycle minus one so that the log will look like:
1679 : * n ... | n - 1 ...
1680 : */
1681 58330 : error = xlog_write_log_records(log, (head_cycle - 1),
1682 : head_block, max_distance, tail_cycle,
1683 : tail_block);
1684 58330 : if (error)
1685 0 : return error;
1686 : } else {
1687 : /*
1688 : * We need to wrap around the end of the physical log in
1689 : * order to clear all the blocks. Do it in two separate
1690 : * I/Os. The first write should be from the head to the
1691 : * end of the physical log, and it should use the current
1692 : * cycle number minus one just like above.
1693 : */
1694 873 : distance = log->l_logBBsize - head_block;
1695 873 : error = xlog_write_log_records(log, (head_cycle - 1),
1696 : head_block, distance, tail_cycle,
1697 : tail_block);
1698 :
1699 873 : if (error)
1700 : return error;
1701 :
1702 : /*
1703 : * Now write the blocks at the start of the physical log.
1704 : * This writes the remainder of the blocks we want to clear.
1705 : * It uses the current cycle number since we're now on the
1706 : * same cycle as the head so that we get:
1707 : * n ... n ... | n - 1 ...
1708 : * ^^^^^ blocks we're writing
1709 : */
1710 873 : distance = max_distance - (log->l_logBBsize - head_block);
1711 873 : error = xlog_write_log_records(log, head_cycle, 0, distance,
1712 : tail_cycle, tail_block);
1713 873 : if (error)
1714 0 : return error;
1715 : }
1716 :
1717 : return 0;
1718 : }
1719 :
1720 : /*
1721 : * Release the recovered intent item in the AIL that matches the given intent
1722 : * type and intent id.
1723 : */
1724 : void
1725 150951 : xlog_recover_release_intent(
1726 : struct xlog *log,
1727 : unsigned short intent_type,
1728 : uint64_t intent_id)
1729 : {
1730 150951 : struct xfs_ail_cursor cur;
1731 150951 : struct xfs_log_item *lip;
1732 150951 : struct xfs_ail *ailp = log->l_ailp;
1733 :
1734 150951 : spin_lock(&ailp->ail_lock);
1735 180775 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1736 29824 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1737 180119 : if (lip->li_type != intent_type)
1738 22378 : continue;
1739 157741 : if (!lip->li_ops->iop_match(lip, intent_id))
1740 7446 : continue;
1741 :
1742 150295 : spin_unlock(&ailp->ail_lock);
1743 150295 : lip->li_ops->iop_release(lip);
1744 150295 : spin_lock(&ailp->ail_lock);
1745 : break;
1746 : }
1747 :
1748 150951 : xfs_trans_ail_cursor_done(&cur);
1749 150951 : spin_unlock(&ailp->ail_lock);
1750 150951 : }
1751 :
1752 : int
1753 732 : xlog_recover_iget(
1754 : struct xfs_mount *mp,
1755 : xfs_ino_t ino,
1756 : struct xfs_inode **ipp)
1757 : {
1758 732 : int error;
1759 :
1760 732 : error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
1761 732 : if (error)
1762 : return error;
1763 :
1764 732 : error = xfs_qm_dqattach(*ipp);
1765 732 : if (error) {
1766 0 : xfs_irele(*ipp);
1767 0 : return error;
1768 : }
1769 :
1770 732 : if (VFS_I(*ipp)->i_nlink == 0)
1771 30 : xfs_iflags_set(*ipp, XFS_IRECOVERY);
1772 :
1773 : return 0;
1774 : }
1775 :
1776 : /******************************************************************************
1777 : *
1778 : * Log recover routines
1779 : *
1780 : ******************************************************************************
1781 : */
/*
 * Table of the recovery operations known to this file, one entry per log
 * item type. xlog_find_item_ops() scans it linearly and selects the entry
 * whose item_type matches the recovered item.
 */
static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
	&xlog_buf_item_ops,
	&xlog_inode_item_ops,
	&xlog_dquot_item_ops,
	&xlog_quotaoff_item_ops,
	&xlog_icreate_item_ops,
	&xlog_efi_item_ops,
	&xlog_efd_item_ops,
	&xlog_rui_item_ops,
	&xlog_rud_item_ops,
	&xlog_cui_item_ops,
	&xlog_cud_item_ops,
	&xlog_bui_item_ops,
	&xlog_bud_item_ops,
	&xlog_attri_item_ops,
	&xlog_attrd_item_ops,
};
1799 :
1800 : static const struct xlog_recover_item_ops *
1801 65203640 : xlog_find_item_ops(
1802 : struct xlog_recover_item *item)
1803 : {
1804 65203640 : unsigned int i;
1805 :
1806 115624724 : for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1807 115624724 : if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1808 65203640 : return xlog_recover_item_ops[i];
1809 :
1810 : return NULL;
1811 : }
1812 :
1813 : /*
1814 : * Sort the log items in the transaction.
1815 : *
1816 : * The ordering constraints are defined by the inode allocation and unlink
1817 : * behaviour. The rules are:
1818 : *
1819 : * 1. Every item is only logged once in a given transaction. Hence it
1820 : * represents the last logged state of the item. Hence ordering is
1821 : * dependent on the order in which operations need to be performed so
1822 : * required initial conditions are always met.
1823 : *
1824 : * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1825 : * there's nothing to replay from them so we can simply cull them
1826 : * from the transaction. However, we can't do that until after we've
1827 : * replayed all the other items because they may be dependent on the
1828 : * cancelled buffer and replaying the cancelled buffer can remove it
1829 : * from the cancelled buffer table. Hence they have to be done last.
1830 : *
1831 : * 3. Inode allocation buffers must be replayed before inode items that
1832 : * read the buffer and replay changes into it. For filesystems using the
1833 : * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1834 : * treated the same as inode allocation buffers as they create and
1835 : * initialise the buffers directly.
1836 : *
1837 : * 4. Inode unlink buffers must be replayed after inode items are replayed.
1838 : * This ensures that inodes are completely flushed to the inode buffer
1839 : * in a "free" state before we remove the unlinked inode list pointer.
1840 : *
1841 : * Hence the ordering needs to be inode allocation buffers first, inode items
1842 : * second, inode unlink buffers third and cancelled buffers last.
1843 : *
1844 : * But there's a problem with that - we can't tell an inode allocation buffer
1845 : * apart from a regular buffer, so we can't separate them. We can, however,
1846 : * tell an inode unlink buffer from the others, and so we can separate them out
1847 : * from all the other buffers and move them to last.
1848 : *
1849 : * Hence, 4 lists, in order from head to tail:
1850 : * - buffer_list for all buffers except cancelled/inode unlink buffers
1851 : * - item_list for all non-buffer items
1852 : * - inode_buffer_list for inode unlink buffers
1853 : * - cancel_list for the cancelled buffers
1854 : *
1855 : * Note that we add objects to the tail of the lists so that first-to-last
1856 : * ordering is preserved within the lists. Adding objects to the head of the
1857 : * list means when we traverse from the head we walk them in last-to-first
1858 : * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1859 : * but for all other items there may be specific ordering that we need to
1860 : * preserve.
1861 : */
1862 : STATIC int
1863 904380 : xlog_recover_reorder_trans(
1864 : struct xlog *log,
1865 : struct xlog_recover *trans,
1866 : int pass)
1867 : {
1868 904380 : struct xlog_recover_item *item, *n;
1869 904380 : int error = 0;
1870 904380 : LIST_HEAD(sort_list);
1871 904380 : LIST_HEAD(cancel_list);
1872 904380 : LIST_HEAD(buffer_list);
1873 904380 : LIST_HEAD(inode_buffer_list);
1874 904380 : LIST_HEAD(item_list);
1875 :
1876 904380 : list_splice_init(&trans->r_itemq, &sort_list);
1877 66108020 : list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1878 65203640 : enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
1879 :
1880 65203640 : item->ri_ops = xlog_find_item_ops(item);
1881 65203640 : if (!item->ri_ops) {
1882 0 : xfs_warn(log->l_mp,
1883 : "%s: unrecognized type of log operation (%d)",
1884 : __func__, ITEM_TYPE(item));
1885 0 : ASSERT(0);
1886 : /*
1887 : * return the remaining items back to the transaction
1888 : * item list so they can be freed in caller.
1889 : */
1890 0 : if (!list_empty(&sort_list))
1891 0 : list_splice_init(&sort_list, &trans->r_itemq);
1892 : error = -EFSCORRUPTED;
1893 : break;
1894 : }
1895 :
1896 65203640 : if (item->ri_ops->reorder)
1897 33154902 : fate = item->ri_ops->reorder(item);
1898 :
1899 33154902 : switch (fate) {
1900 32490146 : case XLOG_REORDER_BUFFER_LIST:
1901 32490146 : list_move_tail(&item->ri_list, &buffer_list);
1902 32490146 : break;
1903 598062 : case XLOG_REORDER_CANCEL_LIST:
1904 598062 : trace_xfs_log_recover_item_reorder_head(log,
1905 : trans, item, pass);
1906 598062 : list_move(&item->ri_list, &cancel_list);
1907 598062 : break;
1908 66694 : case XLOG_REORDER_INODE_BUFFER_LIST:
1909 66694 : list_move(&item->ri_list, &inode_buffer_list);
1910 66694 : break;
1911 32048738 : case XLOG_REORDER_ITEM_LIST:
1912 32048738 : trace_xfs_log_recover_item_reorder_tail(log,
1913 : trans, item, pass);
1914 32048738 : list_move_tail(&item->ri_list, &item_list);
1915 32048738 : break;
1916 : }
1917 : }
1918 :
1919 904380 : ASSERT(list_empty(&sort_list));
1920 904380 : if (!list_empty(&buffer_list))
1921 865410 : list_splice(&buffer_list, &trans->r_itemq);
1922 904380 : if (!list_empty(&item_list))
1923 901362 : list_splice_tail(&item_list, &trans->r_itemq);
1924 904380 : if (!list_empty(&inode_buffer_list))
1925 4858 : list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1926 904380 : if (!list_empty(&cancel_list))
1927 109340 : list_splice_tail(&cancel_list, &trans->r_itemq);
1928 904380 : return error;
1929 : }
1930 :
1931 : void
1932 32266373 : xlog_buf_readahead(
1933 : struct xlog *log,
1934 : xfs_daddr_t blkno,
1935 : uint len,
1936 : const struct xfs_buf_ops *ops)
1937 : {
1938 32266373 : if (!xlog_is_buffer_cancelled(log, blkno, len))
1939 31417942 : xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
1940 32266373 : }
1941 :
1942 : STATIC int
1943 612874 : xlog_recover_items_pass2(
1944 : struct xlog *log,
1945 : struct xlog_recover *trans,
1946 : struct list_head *buffer_list,
1947 : struct list_head *item_list)
1948 : {
1949 612874 : struct xlog_recover_item *item;
1950 612874 : int error = 0;
1951 :
1952 33214694 : list_for_each_entry(item, item_list, ri_list) {
1953 32601820 : trace_xfs_log_recover_item_recover(log, trans, item,
1954 : XLOG_RECOVER_PASS2);
1955 :
1956 32601820 : if (item->ri_ops->commit_pass2)
1957 32601820 : error = item->ri_ops->commit_pass2(log, buffer_list,
1958 : item, trans->r_lsn);
1959 32601820 : if (error)
1960 0 : return error;
1961 : }
1962 :
1963 : return error;
1964 : }
1965 :
1966 : /*
1967 : * Perform the transaction.
1968 : *
1969 : * If the transaction modifies a buffer or inode, do it now. Otherwise,
1970 : * EFIs and EFDs get queued up by adding entries into the AIL for them.
1971 : */
1972 : STATIC int
1973 904380 : xlog_recover_commit_trans(
1974 : struct xlog *log,
1975 : struct xlog_recover *trans,
1976 : int pass,
1977 : struct list_head *buffer_list)
1978 : {
1979 904380 : int error = 0;
1980 904380 : int items_queued = 0;
1981 904380 : struct xlog_recover_item *item;
1982 904380 : struct xlog_recover_item *next;
1983 904380 : LIST_HEAD (ra_list);
1984 904380 : LIST_HEAD (done_list);
1985 :
1986 : #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
1987 :
1988 904380 : hlist_del_init(&trans->r_list);
1989 :
1990 904380 : error = xlog_recover_reorder_trans(log, trans, pass);
1991 904380 : if (error)
1992 : return error;
1993 :
1994 66108020 : list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1995 65203640 : trace_xfs_log_recover_item_recover(log, trans, item, pass);
1996 :
1997 65203640 : switch (pass) {
1998 32601820 : case XLOG_RECOVER_PASS1:
1999 32601820 : if (item->ri_ops->commit_pass1)
2000 16548752 : error = item->ri_ops->commit_pass1(log, item);
2001 : break;
2002 32601820 : case XLOG_RECOVER_PASS2:
2003 32601820 : if (item->ri_ops->ra_pass2)
2004 32266373 : item->ri_ops->ra_pass2(log, item);
2005 32601820 : list_move_tail(&item->ri_list, &ra_list);
2006 32601820 : items_queued++;
2007 32601820 : if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
2008 162776 : error = xlog_recover_items_pass2(log, trans,
2009 : buffer_list, &ra_list);
2010 162776 : list_splice_tail_init(&ra_list, &done_list);
2011 : items_queued = 0;
2012 : }
2013 :
2014 : break;
2015 0 : default:
2016 0 : ASSERT(0);
2017 : }
2018 :
2019 65203640 : if (error)
2020 0 : goto out;
2021 : }
2022 :
2023 904380 : out:
2024 904380 : if (!list_empty(&ra_list)) {
2025 450098 : if (!error)
2026 450098 : error = xlog_recover_items_pass2(log, trans,
2027 : buffer_list, &ra_list);
2028 450098 : list_splice_tail_init(&ra_list, &done_list);
2029 : }
2030 :
2031 904380 : if (!list_empty(&done_list))
2032 452190 : list_splice_init(&done_list, &trans->r_itemq);
2033 :
2034 : return error;
2035 : }
2036 :
2037 : STATIC void
2038 65436486 : xlog_recover_add_item(
2039 : struct list_head *head)
2040 : {
2041 65436486 : struct xlog_recover_item *item;
2042 :
2043 65436486 : item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
2044 65436486 : INIT_LIST_HEAD(&item->ri_list);
2045 65436486 : list_add_tail(&item->ri_list, head);
2046 65436486 : }
2047 :
2048 : STATIC int
2049 2162154 : xlog_recover_add_to_cont_trans(
2050 : struct xlog *log,
2051 : struct xlog_recover *trans,
2052 : char *dp,
2053 : int len)
2054 : {
2055 2162154 : struct xlog_recover_item *item;
2056 2162154 : char *ptr, *old_ptr;
2057 2162154 : int old_len;
2058 :
2059 : /*
2060 : * If the transaction is empty, the header was split across this and the
2061 : * previous record. Copy the rest of the header.
2062 : */
2063 2162154 : if (list_empty(&trans->r_itemq)) {
2064 10 : ASSERT(len <= sizeof(struct xfs_trans_header));
2065 10 : if (len > sizeof(struct xfs_trans_header)) {
2066 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2067 0 : return -EFSCORRUPTED;
2068 : }
2069 :
2070 10 : xlog_recover_add_item(&trans->r_itemq);
2071 10 : ptr = (char *)&trans->r_theader +
2072 10 : sizeof(struct xfs_trans_header) - len;
2073 20 : memcpy(ptr, dp, len);
2074 10 : return 0;
2075 : }
2076 :
2077 : /* take the tail entry */
2078 2162144 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2079 : ri_list);
2080 :
2081 2162144 : old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
2082 2162144 : old_len = item->ri_buf[item->ri_cnt-1].i_len;
2083 :
2084 2162144 : ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2085 2162144 : if (!ptr)
2086 : return -ENOMEM;
2087 4324288 : memcpy(&ptr[old_len], dp, len);
2088 2162144 : item->ri_buf[item->ri_cnt-1].i_len += len;
2089 2162144 : item->ri_buf[item->ri_cnt-1].i_addr = ptr;
2090 2162144 : trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
2091 2162144 : return 0;
2092 : }
2093 :
2094 : /*
2095 : * The next region to add is the start of a new region. It could be
2096 : * a whole region or it could be the first part of a new region. Because
2097 : * of this, the assumption here is that the type and size fields of all
2098 : * format structures fit into the first 32 bits of the structure.
2099 : *
2100 : * This works because all regions must be 32 bit aligned. Therefore, we
2101 : * either have both fields or we have neither field. In the case we have
2102 : * neither field, the data part of the region is zero length. We only have
2103 : * a log_op_header and can throw away the header since a new one will appear
2104 : * later. If we have at least 4 bytes, then we can determine how many regions
2105 : * will appear in the current log item.
2106 : */
2107 : STATIC int
2108 162847070 : xlog_recover_add_to_trans(
2109 : struct xlog *log,
2110 : struct xlog_recover *trans,
2111 : char *dp,
2112 : int len)
2113 : {
2114 162847070 : struct xfs_inode_log_format *in_f; /* any will do */
2115 162847070 : struct xlog_recover_item *item;
2116 162847070 : char *ptr;
2117 :
2118 162847070 : if (!len)
2119 : return 0;
2120 162847070 : if (list_empty(&trans->r_itemq)) {
2121 : /* we need to catch log corruptions here */
2122 906408 : if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
2123 0 : xfs_warn(log->l_mp, "%s: bad header magic number",
2124 : __func__);
2125 0 : ASSERT(0);
2126 0 : return -EFSCORRUPTED;
2127 : }
2128 :
2129 906408 : if (len > sizeof(struct xfs_trans_header)) {
2130 0 : xfs_warn(log->l_mp, "%s: bad header length", __func__);
2131 0 : ASSERT(0);
2132 0 : return -EFSCORRUPTED;
2133 : }
2134 :
2135 : /*
2136 : * The transaction header can be arbitrarily split across op
2137 : * records. If we don't have the whole thing here, copy what we
2138 : * do have and handle the rest in the next record.
2139 : */
2140 906408 : if (len == sizeof(struct xfs_trans_header))
2141 906398 : xlog_recover_add_item(&trans->r_itemq);
2142 1812816 : memcpy(&trans->r_theader, dp, len);
2143 906408 : return 0;
2144 : }
2145 :
2146 161940662 : ptr = kmem_alloc(len, 0);
2147 323881324 : memcpy(ptr, dp, len);
2148 161940662 : in_f = (struct xfs_inode_log_format *)ptr;
2149 :
2150 : /* take the tail entry */
2151 161940662 : item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2152 : ri_list);
2153 161940662 : if (item->ri_total != 0 &&
2154 161034254 : item->ri_total == item->ri_cnt) {
2155 : /* tail item is in use, get a new one */
2156 64530078 : xlog_recover_add_item(&trans->r_itemq);
2157 64530078 : item = list_entry(trans->r_itemq.prev,
2158 : struct xlog_recover_item, ri_list);
2159 : }
2160 :
2161 161940662 : if (item->ri_total == 0) { /* first region to be added */
2162 65436486 : if (in_f->ilf_size == 0 ||
2163 : in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2164 0 : xfs_warn(log->l_mp,
2165 : "bad number of regions (%d) in inode log format",
2166 : in_f->ilf_size);
2167 0 : ASSERT(0);
2168 0 : kmem_free(ptr);
2169 0 : return -EFSCORRUPTED;
2170 : }
2171 :
2172 65436486 : item->ri_total = in_f->ilf_size;
2173 65436486 : item->ri_buf =
2174 65436486 : kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2175 : 0);
2176 : }
2177 :
2178 161940662 : if (item->ri_total <= item->ri_cnt) {
2179 0 : xfs_warn(log->l_mp,
2180 : "log item region count (%d) overflowed size (%d)",
2181 : item->ri_cnt, item->ri_total);
2182 0 : ASSERT(0);
2183 0 : kmem_free(ptr);
2184 0 : return -EFSCORRUPTED;
2185 : }
2186 :
2187 : /* Description region is ri_buf[0] */
2188 161940662 : item->ri_buf[item->ri_cnt].i_addr = ptr;
2189 161940662 : item->ri_buf[item->ri_cnt].i_len = len;
2190 161940662 : item->ri_cnt++;
2191 161940662 : trace_xfs_log_recover_item_add(log, trans, item, 0);
2192 161940662 : return 0;
2193 : }
2194 :
2195 : /*
2196 : * Free up any resources allocated by the transaction
2197 : *
2198 : * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2199 : */
2200 : STATIC void
2201 906408 : xlog_recover_free_trans(
2202 : struct xlog_recover *trans)
2203 : {
2204 906408 : struct xlog_recover_item *item, *n;
2205 906408 : int i;
2206 :
2207 906408 : hlist_del_init(&trans->r_list);
2208 :
2209 66342894 : list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2210 : /* Free the regions in the item. */
2211 65436486 : list_del(&item->ri_list);
2212 227377148 : for (i = 0; i < item->ri_cnt; i++)
2213 161940662 : kmem_free(item->ri_buf[i].i_addr);
2214 : /* Free the item itself */
2215 65436486 : kmem_free(item->ri_buf);
2216 65436486 : kmem_free(item);
2217 : }
2218 : /* Free the transaction recover structure */
2219 906408 : kmem_free(trans);
2220 906408 : }
2221 :
2222 : /*
2223 : * On error or completion, trans is freed.
2224 : */
2225 : STATIC int
2226 165913604 : xlog_recovery_process_trans(
2227 : struct xlog *log,
2228 : struct xlog_recover *trans,
2229 : char *dp,
2230 : unsigned int len,
2231 : unsigned int flags,
2232 : int pass,
2233 : struct list_head *buffer_list)
2234 : {
2235 165913604 : int error = 0;
2236 165913604 : bool freeit = false;
2237 :
2238 : /* mask off ophdr transaction container flags */
2239 165913604 : flags &= ~XLOG_END_TRANS;
2240 165913604 : if (flags & XLOG_WAS_CONT_TRANS)
2241 2162154 : flags &= ~XLOG_CONTINUE_TRANS;
2242 :
2243 : /*
2244 : * Callees must not free the trans structure. We'll decide if we need to
2245 : * free it or not based on the operation being done and it's result.
2246 : */
2247 165913604 : switch (flags) {
2248 : /* expected flag values */
2249 162847070 : case 0:
2250 : case XLOG_CONTINUE_TRANS:
2251 162847070 : error = xlog_recover_add_to_trans(log, trans, dp, len);
2252 162847070 : break;
2253 2162154 : case XLOG_WAS_CONT_TRANS:
2254 2162154 : error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2255 2162154 : break;
2256 904380 : case XLOG_COMMIT_TRANS:
2257 904380 : error = xlog_recover_commit_trans(log, trans, pass,
2258 : buffer_list);
2259 : /* success or fail, we are now done with this transaction. */
2260 904380 : freeit = true;
2261 904380 : break;
2262 :
2263 : /* unexpected flag values */
2264 0 : case XLOG_UNMOUNT_TRANS:
2265 : /* just skip trans */
2266 0 : xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2267 0 : freeit = true;
2268 0 : break;
2269 0 : case XLOG_START_TRANS:
2270 : default:
2271 0 : xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2272 0 : ASSERT(0);
2273 0 : error = -EFSCORRUPTED;
2274 0 : break;
2275 : }
2276 165913604 : if (error || freeit)
2277 904380 : xlog_recover_free_trans(trans);
2278 165913604 : return error;
2279 : }
2280 :
2281 : /*
2282 : * Lookup the transaction recovery structure associated with the ID in the
2283 : * current ophdr. If the transaction doesn't exist and the start flag is set in
2284 : * the ophdr, then allocate a new transaction for future ID matches to find.
2285 : * Either way, return what we found during the lookup - an existing transaction
2286 : * or nothing.
2287 : */
2288 : STATIC struct xlog_recover *
2289 166875628 : xlog_recover_ophdr_to_trans(
2290 : struct hlist_head rhash[],
2291 : struct xlog_rec_header *rhead,
2292 : struct xlog_op_header *ohead)
2293 : {
2294 166875628 : struct xlog_recover *trans;
2295 166875628 : xlog_tid_t tid;
2296 166875628 : struct hlist_head *rhp;
2297 :
2298 166875628 : tid = be32_to_cpu(ohead->oh_tid);
2299 166875628 : rhp = &rhash[XLOG_RHASH(tid)];
2300 333775914 : hlist_for_each_entry(trans, rhp, r_list) {
2301 165938262 : if (trans->r_log_tid == tid)
2302 165913604 : return trans;
2303 : }
2304 :
2305 : /*
2306 : * skip over non-start transaction headers - we could be
2307 : * processing slack space before the next transaction starts
2308 : */
2309 962024 : if (!(ohead->oh_flags & XLOG_START_TRANS))
2310 : return NULL;
2311 :
2312 906408 : ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2313 :
2314 : /*
2315 : * This is a new transaction so allocate a new recovery container to
2316 : * hold the recovery ops that will follow.
2317 : */
2318 906408 : trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2319 906408 : trans->r_log_tid = tid;
2320 906408 : trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2321 906408 : INIT_LIST_HEAD(&trans->r_itemq);
2322 906408 : INIT_HLIST_NODE(&trans->r_list);
2323 906408 : hlist_add_head(&trans->r_list, rhp);
2324 :
2325 : /*
2326 : * Nothing more to do for this ophdr. Items to be added to this new
2327 : * transaction will be in subsequent ophdr containers.
2328 : */
2329 906408 : return NULL;
2330 : }
2331 :
2332 : STATIC int
2333 166875628 : xlog_recover_process_ophdr(
2334 : struct xlog *log,
2335 : struct hlist_head rhash[],
2336 : struct xlog_rec_header *rhead,
2337 : struct xlog_op_header *ohead,
2338 : char *dp,
2339 : char *end,
2340 : int pass,
2341 : struct list_head *buffer_list)
2342 : {
2343 166875628 : struct xlog_recover *trans;
2344 166875628 : unsigned int len;
2345 166875628 : int error;
2346 :
2347 : /* Do we understand who wrote this op? */
2348 166875628 : if (ohead->oh_clientid != XFS_TRANSACTION &&
2349 : ohead->oh_clientid != XFS_LOG) {
2350 0 : xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2351 : __func__, ohead->oh_clientid);
2352 0 : ASSERT(0);
2353 0 : return -EFSCORRUPTED;
2354 : }
2355 :
2356 : /*
2357 : * Check the ophdr contains all the data it is supposed to contain.
2358 : */
2359 166875628 : len = be32_to_cpu(ohead->oh_len);
2360 166875628 : if (dp + len > end) {
2361 0 : xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2362 0 : WARN_ON(1);
2363 0 : return -EFSCORRUPTED;
2364 : }
2365 :
2366 166875628 : trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2367 166875628 : if (!trans) {
2368 : /* nothing to do, so skip over this ophdr */
2369 : return 0;
2370 : }
2371 :
2372 : /*
2373 : * The recovered buffer queue is drained only once we know that all
2374 : * recovery items for the current LSN have been processed. This is
2375 : * required because:
2376 : *
2377 : * - Buffer write submission updates the metadata LSN of the buffer.
2378 : * - Log recovery skips items with a metadata LSN >= the current LSN of
2379 : * the recovery item.
2380 : * - Separate recovery items against the same metadata buffer can share
2381 : * a current LSN. I.e., consider that the LSN of a recovery item is
2382 : * defined as the starting LSN of the first record in which its
2383 : * transaction appears, that a record can hold multiple transactions,
2384 : * and/or that a transaction can span multiple records.
2385 : *
2386 : * In other words, we are allowed to submit a buffer from log recovery
2387 : * once per current LSN. Otherwise, we may incorrectly skip recovery
2388 : * items and cause corruption.
2389 : *
2390 : * We don't know up front whether buffers are updated multiple times per
2391 : * LSN. Therefore, track the current LSN of each commit log record as it
2392 : * is processed and drain the queue when it changes. Use commit records
2393 : * because they are ordered correctly by the logging code.
2394 : */
2395 165913604 : if (log->l_recovery_lsn != trans->r_lsn &&
2396 165210814 : ohead->oh_flags & XLOG_COMMIT_TRANS) {
2397 901069 : error = xfs_buf_delwri_submit(buffer_list);
2398 901069 : if (error)
2399 : return error;
2400 901069 : log->l_recovery_lsn = trans->r_lsn;
2401 : }
2402 :
2403 165913604 : return xlog_recovery_process_trans(log, trans, dp, len,
2404 165913604 : ohead->oh_flags, pass, buffer_list);
2405 : }
2406 :
2407 : /*
2408 : * There are two valid states of the r_state field. 0 indicates that the
2409 : * transaction structure is in a normal state. We have either seen the
2410 : * start of the transaction or the last operation we added was not a partial
2411 : * operation. If the last operation we added to the transaction was a
2412 : * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2413 : *
2414 : * NOTE: skip LRs with 0 data length.
2415 : */
2416 : STATIC int
2417 3124608 : xlog_recover_process_data(
2418 : struct xlog *log,
2419 : struct hlist_head rhash[],
2420 : struct xlog_rec_header *rhead,
2421 : char *dp,
2422 : int pass,
2423 : struct list_head *buffer_list)
2424 : {
2425 3124608 : struct xlog_op_header *ohead;
2426 3124608 : char *end;
2427 3124608 : int num_logops;
2428 3124608 : int error;
2429 :
2430 3124608 : end = dp + be32_to_cpu(rhead->h_len);
2431 3124608 : num_logops = be32_to_cpu(rhead->h_num_logops);
2432 :
2433 : /* check the log format matches our own - else we can't recover */
2434 3124608 : if (xlog_header_check_recover(log->l_mp, rhead))
2435 : return -EIO;
2436 :
2437 3124608 : trace_xfs_log_recover_record(log, rhead, pass);
2438 170000236 : while ((dp < end) && num_logops) {
2439 :
2440 166875628 : ohead = (struct xlog_op_header *)dp;
2441 166875628 : dp += sizeof(*ohead);
2442 166875628 : ASSERT(dp <= end);
2443 :
2444 : /* errors will abort recovery */
2445 166875628 : error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
2446 : dp, end, pass, buffer_list);
2447 166875628 : if (error)
2448 0 : return error;
2449 :
2450 166875628 : dp += be32_to_cpu(ohead->oh_len);
2451 166875628 : num_logops--;
2452 : }
2453 : return 0;
2454 : }
2455 :
2456 : /* Take all the collected deferred ops and finish them in order. */
2457 : static int
2458 13717 : xlog_finish_defer_ops(
2459 : struct xfs_mount *mp,
2460 : struct list_head *capture_list)
2461 : {
2462 13717 : struct xfs_defer_capture *dfc, *next;
2463 13717 : struct xfs_trans *tp;
2464 13717 : int error = 0;
2465 :
2466 15057 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2467 1340 : struct xfs_trans_res resv;
2468 1340 : struct xfs_defer_resources dres;
2469 :
2470 : /*
2471 : * Create a new transaction reservation from the captured
2472 : * information. Set logcount to 1 to force the new transaction
2473 : * to regrant every roll so that we can make forward progress
2474 : * in recovery no matter how full the log might be.
2475 : */
2476 1340 : resv.tr_logres = dfc->dfc_logres;
2477 1340 : resv.tr_logcount = 1;
2478 1340 : resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2479 :
2480 1340 : error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2481 : dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2482 1340 : if (error) {
2483 0 : xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
2484 0 : return error;
2485 : }
2486 :
2487 : /*
2488 : * Transfer to this new transaction all the dfops we captured
2489 : * from recovering a single intent item.
2490 : */
2491 1340 : list_del_init(&dfc->dfc_list);
2492 1340 : xfs_defer_ops_continue(dfc, tp, &dres);
2493 1340 : error = xfs_trans_commit(tp);
2494 1340 : xfs_defer_resources_rele(&dres);
2495 1340 : if (error)
2496 0 : return error;
2497 : }
2498 :
2499 13717 : ASSERT(list_empty(capture_list));
2500 : return 0;
2501 : }
2502 :
2503 : /* Release all the captured defer ops and capture structures in this list. */
2504 : static void
2505 6 : xlog_abort_defer_ops(
2506 : struct xfs_mount *mp,
2507 : struct list_head *capture_list)
2508 : {
2509 6 : struct xfs_defer_capture *dfc;
2510 6 : struct xfs_defer_capture *next;
2511 :
2512 6 : list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2513 0 : list_del_init(&dfc->dfc_list);
2514 0 : xfs_defer_ops_capture_abort(mp, dfc);
2515 : }
2516 6 : }
2517 :
2518 : /*
2519 : * When this is called, all of the log intent items which did not have
2520 : * corresponding log done items should be in the AIL. What we do now is update
2521 : * the data structures associated with each one.
2522 : *
2523 : * Since we process the log intent items in normal transactions, they will be
2524 : * removed at some point after the commit. This prevents us from just walking
2525 : * down the list processing each one. We'll use a flag in the intent item to
2526 : * skip those that we've already processed and use the AIL iteration mechanism's
2527 : * generation count to try to speed this up at least a bit.
2528 : *
2529 : * When we start, we know that the intents are the only things in the AIL. As we
2530 : * process them, however, other items are added to the AIL. Hence we know we
2531 : * have started recovery on all the pending intents when we find an non-intent
2532 : * item in the AIL.
2533 : */
2534 : STATIC int
2535 13723 : xlog_recover_process_intents(
2536 : struct xlog *log)
2537 : {
2538 13723 : LIST_HEAD(capture_list);
2539 13723 : struct xfs_ail_cursor cur;
2540 13723 : struct xfs_log_item *lip;
2541 13723 : struct xfs_ail *ailp;
2542 13723 : int error = 0;
2543 : #if defined(DEBUG) || defined(XFS_WARN)
2544 13723 : xfs_lsn_t last_lsn;
2545 : #endif
2546 :
2547 13723 : ailp = log->l_ailp;
2548 13723 : spin_lock(&ailp->ail_lock);
2549 : #if defined(DEBUG) || defined(XFS_WARN)
2550 13723 : last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2551 : #endif
2552 13723 : for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2553 19219 : lip != NULL;
2554 5496 : lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
2555 5502 : const struct xfs_item_ops *ops;
2556 :
2557 5502 : if (!xlog_item_is_intent(lip))
2558 : break;
2559 :
2560 : /*
2561 : * We should never see a redo item with a LSN higher than
2562 : * the last transaction we found in the log at the start
2563 : * of recovery.
2564 : */
2565 11004 : ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2566 :
2567 : /*
2568 : * NOTE: If your intent processing routine can create more
2569 : * deferred ops, you /must/ attach them to the capture list in
2570 : * the recover routine or else those subsequent intents will be
2571 : * replayed in the wrong order!
2572 : *
2573 : * The recovery function can free the log item, so we must not
2574 : * access lip after it returns.
2575 : */
2576 5502 : spin_unlock(&ailp->ail_lock);
2577 5502 : ops = lip->li_ops;
2578 5502 : error = ops->iop_recover(lip, &capture_list);
2579 5502 : spin_lock(&ailp->ail_lock);
2580 5502 : if (error) {
2581 6 : trace_xlog_intent_recovery_failed(log->l_mp, error,
2582 6 : ops->iop_recover);
2583 6 : break;
2584 : }
2585 : }
2586 :
2587 13723 : xfs_trans_ail_cursor_done(&cur);
2588 13723 : spin_unlock(&ailp->ail_lock);
2589 13723 : if (error)
2590 6 : goto err;
2591 :
2592 13717 : error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2593 13717 : if (error)
2594 0 : goto err;
2595 :
2596 : return 0;
2597 6 : err:
2598 6 : xlog_abort_defer_ops(log->l_mp, &capture_list);
2599 6 : return error;
2600 : }
2601 :
2602 : /*
2603 : * A cancel occurs when the mount has failed and we're bailing out. Release all
2604 : * pending log intent items that we haven't started recovery on so they don't
2605 : * pin the AIL.
2606 : */
2607 : STATIC void
2608 6 : xlog_recover_cancel_intents(
2609 : struct xlog *log)
2610 : {
2611 6 : struct xfs_log_item *lip;
2612 6 : struct xfs_ail_cursor cur;
2613 6 : struct xfs_ail *ailp;
2614 :
2615 6 : ailp = log->l_ailp;
2616 6 : spin_lock(&ailp->ail_lock);
2617 6 : lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2618 6 : while (lip != NULL) {
2619 0 : if (!xlog_item_is_intent(lip))
2620 : break;
2621 :
2622 0 : spin_unlock(&ailp->ail_lock);
2623 0 : lip->li_ops->iop_release(lip);
2624 0 : spin_lock(&ailp->ail_lock);
2625 0 : lip = xfs_trans_ail_cursor_next(ailp, &cur);
2626 : }
2627 :
2628 6 : xfs_trans_ail_cursor_done(&cur);
2629 6 : spin_unlock(&ailp->ail_lock);
2630 6 : }
2631 :
2632 : /*
2633 : * This routine performs a transaction to null out a bad inode pointer
2634 : * in an agi unlinked inode hash bucket.
2635 : */
2636 : STATIC void
2637 10 : xlog_recover_clear_agi_bucket(
2638 : struct xfs_perag *pag,
2639 : int bucket)
2640 : {
2641 10 : struct xfs_mount *mp = pag->pag_mount;
2642 10 : struct xfs_trans *tp;
2643 10 : struct xfs_agi *agi;
2644 10 : struct xfs_buf *agibp;
2645 10 : int offset;
2646 10 : int error;
2647 :
2648 10 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2649 10 : if (error)
2650 10 : goto out_error;
2651 :
2652 0 : error = xfs_read_agi(pag, tp, &agibp);
2653 0 : if (error)
2654 0 : goto out_abort;
2655 :
2656 0 : agi = agibp->b_addr;
2657 0 : agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
2658 0 : offset = offsetof(xfs_agi_t, agi_unlinked) +
2659 : (sizeof(xfs_agino_t) * bucket);
2660 0 : xfs_trans_log_buf(tp, agibp, offset,
2661 : (offset + sizeof(xfs_agino_t) - 1));
2662 :
2663 0 : error = xfs_trans_commit(tp);
2664 0 : if (error)
2665 0 : goto out_error;
2666 : return;
2667 :
2668 : out_abort:
2669 0 : xfs_trans_cancel(tp);
2670 10 : out_error:
2671 10 : xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
2672 : pag->pag_agno);
2673 10 : return;
2674 : }
2675 :
2676 : static int
2677 3740160 : xlog_recover_iunlink_bucket(
2678 : struct xfs_perag *pag,
2679 : struct xfs_agi *agi,
2680 : int bucket)
2681 : {
2682 3740160 : struct xfs_mount *mp = pag->pag_mount;
2683 3740160 : struct xfs_inode *prev_ip = NULL;
2684 3740160 : struct xfs_inode *ip;
2685 3740160 : xfs_agino_t prev_agino, agino;
2686 3740160 : int error = 0;
2687 :
2688 3740160 : agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2689 4635345 : while (agino != NULLAGINO) {
2690 1790370 : error = xfs_iget(mp, NULL,
2691 895185 : XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
2692 : 0, 0, &ip);
2693 895185 : if (error)
2694 : break;
2695 :
2696 895185 : ASSERT(VFS_I(ip)->i_nlink == 0);
2697 895185 : ASSERT(VFS_I(ip)->i_mode != 0);
2698 895185 : xfs_iflags_clear(ip, XFS_IRECOVERY);
2699 895185 : agino = ip->i_next_unlinked;
2700 :
2701 895185 : if (prev_ip) {
2702 885979 : ip->i_prev_unlinked = prev_agino;
2703 885979 : xfs_irele(prev_ip);
2704 :
2705 : /*
2706 : * Ensure the inode is removed from the unlinked list
2707 : * before we continue so that it won't race with
2708 : * building the in-memory list here. This could be
2709 : * serialised with the agibp lock, but that just
2710 : * serialises via lockstepping and it's much simpler
2711 : * just to flush the inodegc queue and wait for it to
2712 : * complete.
2713 : */
2714 885979 : error = xfs_inodegc_flush(mp);
2715 885979 : if (error)
2716 : break;
2717 : }
2718 :
2719 895185 : prev_agino = agino;
2720 895185 : prev_ip = ip;
2721 : }
2722 :
2723 3740160 : if (prev_ip) {
2724 9206 : int error2;
2725 :
2726 9206 : ip->i_prev_unlinked = prev_agino;
2727 9206 : xfs_irele(prev_ip);
2728 :
2729 9206 : error2 = xfs_inodegc_flush(mp);
2730 9206 : if (error2 && !error)
2731 10 : return error2;
2732 : }
2733 : return error;
2734 : }
2735 :
2736 : /*
2737 : * Recover AGI unlinked lists
2738 : *
2739 : * This is called during recovery to process any inodes which we unlinked but
2740 : * not freed when the system crashed. These inodes will be on the lists in the
2741 : * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2742 : * any inodes found on the lists. Each inode is removed from the lists when it
2743 : * has been fully truncated and is freed. The freeing of the inode and its
2744 : * removal from the list must be atomic.
2745 : *
2746 : * If everything we touch in the agi processing loop is already in memory, this
2747 : * loop can hold the cpu for a long time. It runs without lock contention,
2748 : * memory allocation contention, the need wait for IO, etc, and so will run
2749 : * until we either run out of inodes to process, run low on memory or we run out
2750 : * of log space.
2751 : *
2752 : * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2753 : * and can prevent other filesystem work (such as CIL pushes) from running. This
2754 : * can lead to deadlocks if the recovery process runs out of log reservation
2755 : * space. Hence we need to yield the CPU when there is other kernel work
2756 : * scheduled on this CPU to ensure other scheduled work can run without undue
2757 : * latency.
2758 : */
2759 : static void
2760 58494 : xlog_recover_iunlink_ag(
2761 : struct xfs_perag *pag)
2762 : {
2763 58494 : struct xfs_agi *agi;
2764 58494 : struct xfs_buf *agibp;
2765 58494 : int bucket;
2766 58494 : int error;
2767 :
2768 58494 : error = xfs_read_agi(pag, NULL, &agibp);
2769 58494 : if (error) {
2770 : /*
2771 : * AGI is b0rked. Don't process it.
2772 : *
2773 : * We should probably mark the filesystem as corrupt after we've
2774 : * recovered all the ag's we can....
2775 : */
2776 54 : return;
2777 : }
2778 :
2779 : /*
2780 : * Unlock the buffer so that it can be acquired in the normal course of
2781 : * the transaction to truncate and free each inode. Because we are not
2782 : * racing with anyone else here for the AGI buffer, we don't even need
2783 : * to hold it locked to read the initial unlinked bucket entries out of
2784 : * the buffer. We keep buffer reference though, so that it stays pinned
2785 : * in memory while we need the buffer.
2786 : */
2787 58440 : agi = agibp->b_addr;
2788 58440 : xfs_buf_unlock(agibp);
2789 :
2790 3857040 : for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
2791 3740160 : error = xlog_recover_iunlink_bucket(pag, agi, bucket);
2792 3740160 : if (error) {
2793 : /*
2794 : * Bucket is unrecoverable, so only a repair scan can
2795 : * free the remaining unlinked inodes. Just empty the
2796 : * bucket and remaining inodes on it unreferenced and
2797 : * unfreeable.
2798 : */
2799 10 : xlog_recover_clear_agi_bucket(pag, bucket);
2800 : }
2801 : }
2802 :
2803 58440 : xfs_buf_rele(agibp);
2804 : }
2805 :
2806 : static void
2807 13717 : xlog_recover_process_iunlinks(
2808 : struct xlog *log)
2809 : {
2810 13717 : struct xfs_perag *pag;
2811 13717 : xfs_agnumber_t agno;
2812 :
2813 72211 : for_each_perag(log->l_mp, agno, pag)
2814 58494 : xlog_recover_iunlink_ag(pag);
2815 13717 : }
2816 :
2817 : STATIC void
2818 3124608 : xlog_unpack_data(
2819 : struct xlog_rec_header *rhead,
2820 : char *dp,
2821 : struct xlog *log)
2822 : {
2823 3124608 : int i, j, k;
2824 :
2825 171072186 : for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
2826 167947578 : i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2827 167947578 : *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
2828 167947578 : dp += BBSIZE;
2829 : }
2830 :
2831 3124608 : if (xfs_has_logv2(log->l_mp)) {
2832 : xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2833 4749354 : for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
2834 1624746 : j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2835 1624746 : k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2836 1624746 : *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
2837 1624746 : dp += BBSIZE;
2838 : }
2839 : }
2840 3124608 : }
2841 :
2842 : /*
2843 : * CRC check, unpack and process a log record.
2844 : */
2845 : STATIC int
2846 4765274 : xlog_recover_process(
2847 : struct xlog *log,
2848 : struct hlist_head rhash[],
2849 : struct xlog_rec_header *rhead,
2850 : char *dp,
2851 : int pass,
2852 : struct list_head *buffer_list)
2853 : {
2854 4765274 : __le32 old_crc = rhead->h_crc;
2855 4765274 : __le32 crc;
2856 :
2857 4765274 : crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
2858 :
2859 : /*
2860 : * Nothing else to do if this is a CRC verification pass. Just return
2861 : * if this a record with a non-zero crc. Unfortunately, mkfs always
2862 : * sets old_crc to 0 so we must consider this valid even on v5 supers.
2863 : * Otherwise, return EFSBADCRC on failure so the callers up the stack
2864 : * know precisely what failed.
2865 : */
2866 4765274 : if (pass == XLOG_RECOVER_CRCPASS) {
2867 1640666 : if (old_crc && crc != old_crc)
2868 : return -EFSBADCRC;
2869 1640610 : return 0;
2870 : }
2871 :
2872 : /*
2873 : * We're in the normal recovery path. Issue a warning if and only if the
2874 : * CRC in the header is non-zero. This is an advisory warning and the
2875 : * zero CRC check prevents warnings from being emitted when upgrading
2876 : * the kernel from one that does not add CRCs by default.
2877 : */
2878 3124608 : if (crc != old_crc) {
2879 0 : if (old_crc || xfs_has_crc(log->l_mp)) {
2880 0 : xfs_alert(log->l_mp,
2881 : "log record CRC mismatch: found 0x%x, expected 0x%x.",
2882 : le32_to_cpu(old_crc),
2883 : le32_to_cpu(crc));
2884 0 : xfs_hex_dump(dp, 32);
2885 : }
2886 :
2887 : /*
2888 : * If the filesystem is CRC enabled, this mismatch becomes a
2889 : * fatal log corruption failure.
2890 : */
2891 0 : if (xfs_has_crc(log->l_mp)) {
2892 0 : XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2893 0 : return -EFSCORRUPTED;
2894 : }
2895 : }
2896 :
2897 3124608 : xlog_unpack_data(rhead, dp, log);
2898 :
2899 3124608 : return xlog_recover_process_data(log, rhash, rhead, dp, pass,
2900 : buffer_list);
2901 : }
2902 :
2903 : STATIC int
2904 4820190 : xlog_valid_rec_header(
2905 : struct xlog *log,
2906 : struct xlog_rec_header *rhead,
2907 : xfs_daddr_t blkno,
2908 : int bufsize)
2909 : {
2910 4820190 : int hlen;
2911 :
2912 4820190 : if (XFS_IS_CORRUPT(log->l_mp,
2913 : rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
2914 0 : return -EFSCORRUPTED;
2915 4820190 : if (XFS_IS_CORRUPT(log->l_mp,
2916 : (!rhead->h_version ||
2917 : (be32_to_cpu(rhead->h_version) &
2918 : (~XLOG_VERSION_OKBITS))))) {
2919 0 : xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
2920 : __func__, be32_to_cpu(rhead->h_version));
2921 0 : return -EFSCORRUPTED;
2922 : }
2923 :
2924 : /*
2925 : * LR body must have data (or it wouldn't have been written)
2926 : * and h_len must not be greater than LR buffer size.
2927 : */
2928 4820190 : hlen = be32_to_cpu(rhead->h_len);
2929 4820190 : if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
2930 0 : return -EFSCORRUPTED;
2931 :
2932 4820190 : if (XFS_IS_CORRUPT(log->l_mp,
2933 : blkno > log->l_logBBsize || blkno > INT_MAX))
2934 0 : return -EFSCORRUPTED;
2935 : return 0;
2936 : }
2937 :
2938 : /*
2939 : * Read the log from tail to head and process the log records found.
2940 : * Handle the two cases where the tail and head are in the same cycle
2941 : * and where the active portion of the log wraps around the end of
2942 : * the physical log separately. The pass parameter is passed through
2943 : * to the routines called to process the data and is not looked at
2944 : * here.
2945 : */
2946 : STATIC int
2947 54916 : xlog_do_recovery_pass(
2948 : struct xlog *log,
2949 : xfs_daddr_t head_blk,
2950 : xfs_daddr_t tail_blk,
2951 : int pass,
2952 : xfs_daddr_t *first_bad) /* out: first bad log rec */
2953 : {
2954 54916 : xlog_rec_header_t *rhead;
2955 54916 : xfs_daddr_t blk_no, rblk_no;
2956 54916 : xfs_daddr_t rhead_blk;
2957 54916 : char *offset;
2958 54916 : char *hbp, *dbp;
2959 54916 : int error = 0, h_size, h_len;
2960 54916 : int error2 = 0;
2961 54916 : int bblks, split_bblks;
2962 54916 : int hblks, split_hblks, wrapped_hblks;
2963 54916 : int i;
2964 54916 : struct hlist_head rhash[XLOG_RHASH_SIZE];
2965 54916 : LIST_HEAD (buffer_list);
2966 :
2967 54916 : ASSERT(head_blk != tail_blk);
2968 : blk_no = rhead_blk = tail_blk;
2969 :
2970 933572 : for (i = 0; i < XLOG_RHASH_SIZE; i++)
2971 878656 : INIT_HLIST_HEAD(&rhash[i]);
2972 :
2973 : /*
2974 : * Read the header of the tail block and get the iclog buffer size from
2975 : * h_size. Use this to tell how many sectors make up the log header.
2976 : */
2977 54916 : if (xfs_has_logv2(log->l_mp)) {
2978 : /*
2979 : * When using variable length iclogs, read first sector of
2980 : * iclog header and extract the header size from it. Get a
2981 : * new hbp that is the correct size.
2982 : */
2983 54916 : hbp = xlog_alloc_buffer(log, 1);
2984 54916 : if (!hbp)
2985 : return -ENOMEM;
2986 :
2987 54916 : error = xlog_bread(log, tail_blk, 1, hbp, &offset);
2988 54916 : if (error)
2989 0 : goto bread_err1;
2990 :
2991 54916 : rhead = (xlog_rec_header_t *)offset;
2992 :
2993 : /*
2994 : * xfsprogs has a bug where record length is based on lsunit but
2995 : * h_size (iclog size) is hardcoded to 32k. Now that we
2996 : * unconditionally CRC verify the unmount record, this means the
2997 : * log buffer can be too small for the record and cause an
2998 : * overrun.
2999 : *
3000 : * Detect this condition here. Use lsunit for the buffer size as
3001 : * long as this looks like the mkfs case. Otherwise, return an
3002 : * error to avoid a buffer overrun.
3003 : */
3004 54916 : h_size = be32_to_cpu(rhead->h_size);
3005 54916 : h_len = be32_to_cpu(rhead->h_len);
3006 54916 : if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
3007 0 : rhead->h_num_logops == cpu_to_be32(1)) {
3008 0 : xfs_warn(log->l_mp,
3009 : "invalid iclog size (%d bytes), using lsunit (%d bytes)",
3010 : h_size, log->l_mp->m_logbsize);
3011 0 : h_size = log->l_mp->m_logbsize;
3012 : }
3013 :
3014 54916 : error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
3015 54916 : if (error)
3016 0 : goto bread_err1;
3017 :
3018 54916 : hblks = xlog_logrec_hblks(log, rhead);
3019 54916 : if (hblks != 1) {
3020 796 : kmem_free(hbp);
3021 796 : hbp = xlog_alloc_buffer(log, hblks);
3022 : }
3023 : } else {
3024 0 : ASSERT(log->l_sectBBsize == 1);
3025 0 : hblks = 1;
3026 0 : hbp = xlog_alloc_buffer(log, 1);
3027 0 : h_size = XLOG_BIG_RECORD_BSIZE;
3028 : }
3029 :
3030 54916 : if (!hbp)
3031 : return -ENOMEM;
3032 54916 : dbp = xlog_alloc_buffer(log, BTOBB(h_size));
3033 54916 : if (!dbp) {
3034 0 : kmem_free(hbp);
3035 0 : return -ENOMEM;
3036 : }
3037 :
3038 54916 : memset(rhash, 0, sizeof(rhash));
3039 54916 : if (tail_blk > head_blk) {
3040 : /*
3041 : * Perform recovery around the end of the physical log.
3042 : * When the head is not on the same cycle number as the tail,
3043 : * we can't do a sequential recovery.
3044 : */
3045 284494 : while (blk_no < log->l_logBBsize) {
3046 : /*
3047 : * Check for header wrapping around physical end-of-log
3048 : */
3049 282599 : offset = hbp;
3050 282599 : split_hblks = 0;
3051 282599 : wrapped_hblks = 0;
3052 282599 : if (blk_no + hblks <= log->l_logBBsize) {
3053 : /* Read header in one read */
3054 282599 : error = xlog_bread(log, blk_no, hblks, hbp,
3055 : &offset);
3056 282599 : if (error)
3057 0 : goto bread_err2;
3058 : } else {
3059 : /* This LR is split across physical log end */
3060 0 : if (blk_no != log->l_logBBsize) {
3061 : /* some data before physical log end */
3062 0 : ASSERT(blk_no <= INT_MAX);
3063 0 : split_hblks = log->l_logBBsize - (int)blk_no;
3064 0 : ASSERT(split_hblks > 0);
3065 0 : error = xlog_bread(log, blk_no,
3066 : split_hblks, hbp,
3067 : &offset);
3068 0 : if (error)
3069 0 : goto bread_err2;
3070 : }
3071 :
3072 : /*
3073 : * Note: this black magic still works with
3074 : * large sector sizes (non-512) only because:
3075 : * - we increased the buffer size originally
3076 : * by 1 sector giving us enough extra space
3077 : * for the second read;
3078 : * - the log start is guaranteed to be sector
3079 : * aligned;
3080 : * - we read the log end (LR header start)
3081 : * _first_, then the log start (LR header end)
3082 : * - order is important.
3083 : */
3084 0 : wrapped_hblks = hblks - split_hblks;
3085 0 : error = xlog_bread_noalign(log, 0,
3086 : wrapped_hblks,
3087 0 : offset + BBTOB(split_hblks));
3088 0 : if (error)
3089 0 : goto bread_err2;
3090 : }
3091 282599 : rhead = (xlog_rec_header_t *)offset;
3092 282599 : error = xlog_valid_rec_header(log, rhead,
3093 : split_hblks ? blk_no : 0, h_size);
3094 282599 : if (error)
3095 0 : goto bread_err2;
3096 :
3097 282599 : bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3098 282599 : blk_no += hblks;
3099 :
3100 : /*
3101 : * Read the log record data in multiple reads if it
3102 : * wraps around the end of the log. Note that if the
3103 : * header already wrapped, blk_no could point past the
3104 : * end of the log. The record data is contiguous in
3105 : * that case.
3106 : */
3107 282599 : if (blk_no + bblks <= log->l_logBBsize ||
3108 : blk_no >= log->l_logBBsize) {
3109 280956 : rblk_no = xlog_wrap_logbno(log, blk_no);
3110 280956 : error = xlog_bread(log, rblk_no, bblks, dbp,
3111 : &offset);
3112 280956 : if (error)
3113 0 : goto bread_err2;
3114 : } else {
3115 : /* This log record is split across the
3116 : * physical end of log */
3117 1643 : offset = dbp;
3118 1643 : split_bblks = 0;
3119 1643 : if (blk_no != log->l_logBBsize) {
3120 : /* some data is before the physical
3121 : * end of log */
3122 1643 : ASSERT(!wrapped_hblks);
3123 1643 : ASSERT(blk_no <= INT_MAX);
3124 1643 : split_bblks =
3125 1643 : log->l_logBBsize - (int)blk_no;
3126 1643 : ASSERT(split_bblks > 0);
3127 1643 : error = xlog_bread(log, blk_no,
3128 : split_bblks, dbp,
3129 : &offset);
3130 1643 : if (error)
3131 0 : goto bread_err2;
3132 : }
3133 :
3134 : /*
3135 : * Note: this black magic still works with
3136 : * large sector sizes (non-512) only because:
3137 : * - we increased the buffer size originally
3138 : * by 1 sector giving us enough extra space
3139 : * for the second read;
3140 : * - the log start is guaranteed to be sector
3141 : * aligned;
3142 : * - we read the log end (LR header start)
3143 : * _first_, then the log start (LR header end)
3144 : * - order is important.
3145 : */
3146 1643 : error = xlog_bread_noalign(log, 0,
3147 : bblks - split_bblks,
3148 1643 : offset + BBTOB(split_bblks));
3149 1643 : if (error)
3150 0 : goto bread_err2;
3151 : }
3152 :
3153 282599 : error = xlog_recover_process(log, rhash, rhead, offset,
3154 : pass, &buffer_list);
3155 282599 : if (error)
3156 0 : goto bread_err2;
3157 :
3158 : blk_no += bblks;
3159 : rhead_blk = blk_no;
3160 : }
3161 :
3162 1895 : ASSERT(blk_no >= log->l_logBBsize);
3163 1895 : blk_no -= log->l_logBBsize;
3164 1895 : rhead_blk = blk_no;
3165 : }
3166 :
3167 : /* read first part of physical log */
3168 4537535 : while (blk_no < head_blk) {
3169 4482675 : error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3170 4482675 : if (error)
3171 0 : goto bread_err2;
3172 :
3173 4482675 : rhead = (xlog_rec_header_t *)offset;
3174 4482675 : error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
3175 4482675 : if (error)
3176 0 : goto bread_err2;
3177 :
3178 : /* blocks in data section */
3179 4482675 : bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3180 4482675 : error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3181 : &offset);
3182 4482675 : if (error)
3183 0 : goto bread_err2;
3184 :
3185 4482675 : error = xlog_recover_process(log, rhash, rhead, offset, pass,
3186 : &buffer_list);
3187 4482675 : if (error)
3188 56 : goto bread_err2;
3189 :
3190 4482619 : blk_no += bblks + hblks;
3191 4482619 : rhead_blk = blk_no;
3192 : }
3193 :
3194 54860 : bread_err2:
3195 54916 : kmem_free(dbp);
3196 54916 : bread_err1:
3197 54916 : kmem_free(hbp);
3198 :
3199 : /*
3200 : * Submit buffers that have been added from the last record processed,
3201 : * regardless of error status.
3202 : */
3203 54916 : if (!list_empty(&buffer_list))
3204 12849 : error2 = xfs_buf_delwri_submit(&buffer_list);
3205 :
3206 54916 : if (error && first_bad)
3207 56 : *first_bad = rhead_blk;
3208 :
3209 : /*
3210 : * Transactions are freed at commit time but transactions without commit
3211 : * records on disk are never committed. Free any that may be left in the
3212 : * hash table.
3213 : */
3214 933572 : for (i = 0; i < XLOG_RHASH_SIZE; i++) {
3215 878656 : struct hlist_node *tmp;
3216 878656 : struct xlog_recover *trans;
3217 :
3218 1759340 : hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
3219 2028 : xlog_recover_free_trans(trans);
3220 : }
3221 :
3222 54916 : return error ? error : error2;
3223 : }
3224 :
3225 : /*
3226 : * Do the recovery of the log. We actually do this in two phases.
3227 : * The two passes are necessary in order to implement the function
3228 : * of cancelling a record written into the log. The first pass
3229 : * determines those things which have been cancelled, and the
3230 : * second pass replays log items normally except for those which
3231 : * have been cancelled. The handling of the replay and cancellations
3232 : * takes place in the log item type specific routines.
3233 : *
3234 : * The table of items which have cancel records in the log is allocated
3235 : * and freed at this level, since only here do we know when all of
3236 : * the log recovery has been completed.
3237 : */
3238 : STATIC int
3239 13723 : xlog_do_log_recovery(
3240 : struct xlog *log,
3241 : xfs_daddr_t head_blk,
3242 : xfs_daddr_t tail_blk)
3243 : {
3244 13723 : int error;
3245 :
3246 13723 : ASSERT(head_blk != tail_blk);
3247 :
3248 : /*
3249 : * First do a pass to find all of the cancelled buf log items.
3250 : * Store them in the buf_cancel_table for use in the second pass.
3251 : */
3252 13723 : error = xlog_alloc_buf_cancel_table(log);
3253 13723 : if (error)
3254 : return error;
3255 :
3256 13723 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3257 : XLOG_RECOVER_PASS1, NULL);
3258 13723 : if (error != 0)
3259 0 : goto out_cancel;
3260 :
3261 : /*
3262 : * Then do a second pass to actually recover the items in the log.
3263 : * When it is complete free the table of buf cancel items.
3264 : */
3265 13723 : error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3266 : XLOG_RECOVER_PASS2, NULL);
3267 13723 : if (!error)
3268 13723 : xlog_check_buf_cancel_table(log);
3269 0 : out_cancel:
3270 13723 : xlog_free_buf_cancel_table(log);
3271 13723 : return error;
3272 : }
3273 :
3274 : /*
3275 : * Do the actual recovery
3276 : */
3277 : STATIC int
3278 13723 : xlog_do_recover(
3279 : struct xlog *log,
3280 : xfs_daddr_t head_blk,
3281 : xfs_daddr_t tail_blk)
3282 : {
3283 13723 : struct xfs_mount *mp = log->l_mp;
3284 13723 : struct xfs_buf *bp = mp->m_sb_bp;
3285 13723 : struct xfs_sb *sbp = &mp->m_sb;
3286 13723 : int error;
3287 :
3288 13723 : trace_xfs_log_recover(log, head_blk, tail_blk);
3289 :
3290 : /*
3291 : * First replay the images in the log.
3292 : */
3293 13723 : error = xlog_do_log_recovery(log, head_blk, tail_blk);
3294 13723 : if (error)
3295 : return error;
3296 :
3297 27446 : if (xlog_is_shutdown(log))
3298 : return -EIO;
3299 :
3300 : /*
3301 : * We now update the tail_lsn since much of the recovery has completed
3302 : * and there may be space available to use. If there were no extent
3303 : * or iunlinks, we can free up the entire log and set the tail_lsn to
3304 : * be the last_sync_lsn. This was set in xlog_find_tail to be the
3305 : * lsn of the last known good LR on disk. If there are extent frees
3306 : * or iunlinks they will have some entries in the AIL; so we look at
3307 : * the AIL to determine how to set the tail_lsn.
3308 : */
3309 13723 : xlog_assign_tail_lsn(mp);
3310 :
3311 : /*
3312 : * Now that we've finished replaying all buffer and inode updates,
3313 : * re-read the superblock and reverify it.
3314 : */
3315 13723 : xfs_buf_lock(bp);
3316 13723 : xfs_buf_hold(bp);
3317 13723 : error = _xfs_buf_read(bp, XBF_READ);
3318 13723 : if (error) {
3319 0 : if (!xlog_is_shutdown(log)) {
3320 0 : xfs_buf_ioerror_alert(bp, __this_address);
3321 0 : ASSERT(0);
3322 : }
3323 0 : xfs_buf_relse(bp);
3324 0 : return error;
3325 : }
3326 :
3327 : /* Convert superblock from on-disk format */
3328 13723 : xfs_sb_from_disk(sbp, bp->b_addr);
3329 13723 : xfs_buf_relse(bp);
3330 :
3331 : /* re-initialise in-core superblock and geometry structures */
3332 13723 : mp->m_features |= xfs_sb_version_to_features(sbp);
3333 13723 : xfs_reinit_percpu_counters(mp);
3334 13723 : error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
3335 : &mp->m_maxagi);
3336 13723 : if (error) {
3337 0 : xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3338 0 : return error;
3339 : }
3340 13723 : mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
3341 :
3342 : /* Normal transactions can now occur */
3343 13723 : clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
3344 13723 : return 0;
3345 : }
3346 :
3347 : /*
3348 : * Perform recovery and re-initialize some log variables in xlog_find_tail.
3349 : *
3350 : * Return error or zero.
3351 : */
3352 : int
3353 59235 : xlog_recover(
3354 : struct xlog *log)
3355 : {
3356 59235 : xfs_daddr_t head_blk, tail_blk;
3357 59235 : int error;
3358 :
3359 : /* find the tail of the log */
3360 59235 : error = xlog_find_tail(log, &head_blk, &tail_blk);
3361 59235 : if (error)
3362 : return error;
3363 :
3364 : /*
3365 : * The superblock was read before the log was available and thus the LSN
3366 : * could not be verified. Check the superblock LSN against the current
3367 : * LSN now that it's known.
3368 : */
3369 118235 : if (xfs_has_crc(log->l_mp) &&
3370 59020 : !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3371 : return -EINVAL;
3372 :
3373 59204 : if (tail_blk != head_blk) {
3374 : /* There used to be a comment here:
3375 : *
3376 : * disallow recovery on read-only mounts. note -- mount
3377 : * checks for ENOSPC and turns it into an intelligent
3378 : * error message.
3379 : * ...but this is no longer true. Now, unless you specify
3380 : * NORECOVERY (in which case this function would never be
3381 : * called), we just go ahead and recover. We do this all
3382 : * under the vfs layer, so we can get away with it unless
3383 : * the device itself is read-only, in which case we fail.
3384 : */
3385 13734 : if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3386 : return error;
3387 : }
3388 :
3389 : /*
3390 : * Version 5 superblock log feature mask validation. We know the
3391 : * log is dirty so check if there are any unknown log features
3392 : * in what we need to recover. If there are unknown features
3393 : * (e.g. unsupported transactions, then simply reject the
3394 : * attempt at recovery before touching anything.
3395 : */
3396 13723 : if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
3397 : xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3398 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3399 0 : xfs_warn(log->l_mp,
3400 : "Superblock has unknown incompatible log features (0x%x) enabled.",
3401 : (log->l_mp->m_sb.sb_features_log_incompat &
3402 : XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3403 0 : xfs_warn(log->l_mp,
3404 : "The log can not be fully and/or safely recovered by this kernel.");
3405 0 : xfs_warn(log->l_mp,
3406 : "Please recover the log on a kernel that supports the unknown features.");
3407 0 : return -EINVAL;
3408 : }
3409 :
3410 : /*
3411 : * Delay log recovery if the debug hook is set. This is debug
3412 : * instrumentation to coordinate simulation of I/O failures with
3413 : * log recovery.
3414 : */
3415 13723 : if (xfs_globals.log_recovery_delay) {
3416 22 : xfs_notice(log->l_mp,
3417 : "Delaying log recovery for %d seconds.",
3418 : xfs_globals.log_recovery_delay);
3419 22 : msleep(xfs_globals.log_recovery_delay * 1000);
3420 : }
3421 :
3422 15349 : xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3423 : log->l_mp->m_logname ? log->l_mp->m_logname
3424 : : "internal");
3425 :
3426 13723 : error = xlog_do_recover(log, head_blk, tail_blk);
3427 13723 : set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
3428 : }
3429 : return error;
3430 : }
3431 :
3432 : /*
3433 : * In the first part of recovery we replay inodes and buffers and build up the
3434 : * list of intents which need to be processed. Here we process the intents and
3435 : * clean up the on disk unlinked inode lists. This is separated from the first
3436 : * part of recovery so that the root and real-time bitmap inodes can be read in
3437 : * from disk in between the two stages. This is necessary so that we can free
3438 : * space in the real-time portion of the file system.
3439 : */
3440 : int
3441 13723 : xlog_recover_finish(
3442 : struct xlog *log)
3443 : {
3444 13723 : int error;
3445 :
3446 13723 : error = xlog_recover_process_intents(log);
3447 : /*
3448 : * Sync the log to get all the intents that have done item out of
3449 : * the AIL. This isn't absolutely necessary, but it helps in case
3450 : * the unlink transactions would have problems pushing the intents
3451 : * out of the way.
3452 : */
3453 13723 : xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3454 13723 : if (error) {
3455 : /*
3456 : * Cancel all the unprocessed intent items now so that we don't
3457 : * leave them pinned in the AIL. This can cause the AIL to
3458 : * livelock on the pinned item if anyone tries to push the AIL
3459 : * (inode reclaim does this) before we get around to
3460 : * xfs_log_mount_cancel.
3461 : */
3462 6 : xlog_recover_cancel_intents(log);
3463 6 : xfs_alert(log->l_mp, "Failed to recover intents");
3464 6 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3465 6 : return error;
3466 : }
3467 :
3468 : /*
3469 : * Now that we've recovered the log and all the intents, we can clear
3470 : * the log incompat feature bits in the superblock because there's no
3471 : * longer anything to protect. We rely on the AIL push to write out the
3472 : * updated superblock after everything else.
3473 : */
3474 13717 : if (xfs_clear_incompat_log_features(log->l_mp)) {
3475 344 : error = xfs_sync_sb(log->l_mp, false);
3476 344 : if (error < 0) {
3477 0 : xfs_alert(log->l_mp,
3478 : "Failed to clear log incompat features on recovery");
3479 0 : return error;
3480 : }
3481 : }
3482 :
3483 13717 : xlog_recover_process_iunlinks(log);
3484 :
3485 : /*
3486 : * Recover any CoW staging blocks that are still referenced by the
3487 : * ondisk refcount metadata. During mount there cannot be any live
3488 : * staging extents as we have not permitted any user modifications.
3489 : * Therefore, it is safe to free them all right now, even on a
3490 : * read-only mount.
3491 : */
3492 13717 : error = xfs_reflink_recover_cow(log->l_mp);
3493 13717 : if (error) {
3494 22 : xfs_alert(log->l_mp,
3495 : "Failed to recover leftover CoW staging extents, err %d.",
3496 : error);
3497 : /*
3498 : * If we get an error here, make sure the log is shut down
3499 : * but return zero so that any log items committed since the
3500 : * end of intents processing can be pushed through the CIL
3501 : * and AIL.
3502 : */
3503 22 : xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3504 : }
3505 :
3506 : return 0;
3507 : }
3508 :
/*
 * Cancel the pending intent items of @log, but only if the log reports that
 * recovery was needed.
 */
void
xlog_recover_cancel(
	struct xlog	*log)
{
	if (!xlog_recovery_needed(log))
		return;

	xlog_recover_cancel_intents(log);
}
3516 :
|