Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2012 Fusion-io All rights reserved.
4 : * Copyright (C) 2012 Intel Corp. All rights reserved.
5 : */
6 :
7 : #include <linux/sched.h>
8 : #include <linux/bio.h>
9 : #include <linux/slab.h>
10 : #include <linux/blkdev.h>
11 : #include <linux/raid/pq.h>
12 : #include <linux/hash.h>
13 : #include <linux/list_sort.h>
14 : #include <linux/raid/xor.h>
15 : #include <linux/mm.h>
16 : #include "messages.h"
17 : #include "misc.h"
18 : #include "ctree.h"
19 : #include "disk-io.h"
20 : #include "volumes.h"
21 : #include "raid56.h"
22 : #include "async-thread.h"
23 : #include "file-item.h"
24 : #include "btrfs_inode.h"
25 :
26 : /* set when additional merges to this rbio are not allowed */
27 : #define RBIO_RMW_LOCKED_BIT 1
28 :
29 : /*
30 : * set when this rbio is sitting in the hash, but it is just a cache
31 : * of past RMW
32 : */
33 : #define RBIO_CACHE_BIT 2
34 :
35 : /*
36 : * set when it is safe to trust the stripe_pages for caching
37 : */
38 : #define RBIO_CACHE_READY_BIT 3
39 :
40 : #define RBIO_CACHE_SIZE 1024
41 :
42 : #define BTRFS_STRIPE_HASH_TABLE_BITS 11
43 :
44 : /* Used by the raid56 code to lock stripes for read/modify/write */
45 : struct btrfs_stripe_hash {
46 : struct list_head hash_list;
47 : spinlock_t lock;
48 : };
49 :
50 : /* Used by the raid56 code to lock stripes for read/modify/write */
51 : struct btrfs_stripe_hash_table {
52 : struct list_head stripe_cache;
53 : spinlock_t cache_lock;
54 : int cache_size;
55 : struct btrfs_stripe_hash table[];
56 : };
57 :
58 : /*
59 : * A bvec-like structure to represent a sector inside a page.
60 : *
61 : * Unlike bvec we don't need bv_len, as it's fixed to sectorsize.
62 : */
63 : struct sector_ptr {
64 : struct page *page;
65 : unsigned int pgoff:24;
66 : unsigned int uptodate:8;
67 : };
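/*
 * Illustrative note (not from the original source): the 24-bit pgoff field
 * can hold in-page offsets up to 16MiB, far more than any supported page
 * size, and uptodate only needs to carry a boolean; packing both into one
 * 32-bit word keeps the per-sector bookkeeping small.
 */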
68 :
69 : static void rmw_rbio_work(struct work_struct *work);
70 : static void rmw_rbio_work_locked(struct work_struct *work);
71 : static void index_rbio_pages(struct btrfs_raid_bio *rbio);
72 : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
73 :
74 : static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
75 : static void scrub_rbio_work_locked(struct work_struct *work);
76 :
77 0 : static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
78 : {
79 0 : bitmap_free(rbio->error_bitmap);
80 0 : kfree(rbio->stripe_pages);
81 0 : kfree(rbio->bio_sectors);
82 0 : kfree(rbio->stripe_sectors);
83 0 : kfree(rbio->finish_pointers);
84 0 : }
85 :
86 0 : static void free_raid_bio(struct btrfs_raid_bio *rbio)
87 : {
88 0 : int i;
89 :
90 0 : if (!refcount_dec_and_test(&rbio->refs))
91 : return;
92 :
93 0 : WARN_ON(!list_empty(&rbio->stripe_cache));
94 0 : WARN_ON(!list_empty(&rbio->hash_list));
95 0 : WARN_ON(!bio_list_empty(&rbio->bio_list));
96 :
97 0 : for (i = 0; i < rbio->nr_pages; i++) {
98 0 : if (rbio->stripe_pages[i]) {
99 0 : __free_page(rbio->stripe_pages[i]);
100 0 : rbio->stripe_pages[i] = NULL;
101 : }
102 : }
103 :
104 0 : btrfs_put_bioc(rbio->bioc);
105 0 : free_raid_bio_pointers(rbio);
106 0 : kfree(rbio);
107 : }
108 :
109 0 : static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
110 : {
111 0 : INIT_WORK(&rbio->work, work_func);
112 0 : queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
113 0 : }
114 :
115 : /*
116 : * the stripe hash table is used for locking, and to collect
117 : * bios in hopes of making a full stripe
118 : */
119 3242 : int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
120 : {
121 3242 : struct btrfs_stripe_hash_table *table;
122 3242 : struct btrfs_stripe_hash_table *x;
123 3242 : struct btrfs_stripe_hash *cur;
124 3242 : struct btrfs_stripe_hash *h;
125 3242 : int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
126 3242 : int i;
127 :
128 3242 : if (info->stripe_hash_table)
129 : return 0;
130 :
131 : /*
132 : * The table is large, starting with order 4 and can go as high as
133 : * order 7 in case lock debugging is turned on.
134 : *
135 : * Try harder to allocate and fallback to vmalloc to lower the chance
136 : * of a failing mount.
137 : */
138 3242 : table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
139 3242 : if (!table)
140 : return -ENOMEM;
141 :
142 3242 : spin_lock_init(&table->cache_lock);
143 3242 : INIT_LIST_HEAD(&table->stripe_cache);
144 :
145 3242 : h = table->table;
146 :
147 6642858 : for (i = 0; i < num_entries; i++) {
148 6639616 : cur = h + i;
149 6639616 : INIT_LIST_HEAD(&cur->hash_list);
150 6639616 : spin_lock_init(&cur->lock);
151 : }
152 :
153 3242 : x = cmpxchg(&info->stripe_hash_table, NULL, table);
154 3242 : kvfree(x);
155 3242 : return 0;
156 : }
157 :
158 : /*
159 : * Caching an rbio means copying everything from the
160 : * bio_sectors array into the stripe_pages array. We
161 : * use the sector uptodate bit in the stripe_sectors array
162 : * to indicate if it has valid data.
163 : *
164 : * Once the caching is done, we set the cache ready
165 : * bit.
166 : */
167 0 : static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
168 : {
169 0 : int i;
170 0 : int ret;
171 :
172 0 : ret = alloc_rbio_pages(rbio);
173 0 : if (ret)
174 : return;
175 :
176 0 : for (i = 0; i < rbio->nr_sectors; i++) {
177 : /* Some range not covered by bio (partial write), skip it */
178 0 : if (!rbio->bio_sectors[i].page) {
179 : /*
180 : * Even if the sector is not covered by bio, if it is
181 : * a data sector it should still be uptodate as it is
182 : * read from disk.
183 : */
184 0 : if (i < rbio->nr_data * rbio->stripe_nsectors)
185 : ASSERT(rbio->stripe_sectors[i].uptodate);
186 0 : continue;
187 : }
188 :
189 0 : ASSERT(rbio->stripe_sectors[i].page);
190 0 : memcpy_page(rbio->stripe_sectors[i].page,
191 0 : rbio->stripe_sectors[i].pgoff,
192 : rbio->bio_sectors[i].page,
193 0 : rbio->bio_sectors[i].pgoff,
194 0 : rbio->bioc->fs_info->sectorsize);
195 0 : rbio->stripe_sectors[i].uptodate = 1;
196 : }
197 0 : set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
198 : }
199 :
200 : /*
201 : * we hash on the first logical address of the stripe
202 : */
203 : static int rbio_bucket(struct btrfs_raid_bio *rbio)
204 : {
205 0 : u64 num = rbio->bioc->full_stripe_logical;
206 :
207 : /*
208 : * we shift down quite a bit. We're using byte
209 : * addressing, and most of the lower bits are zeros.
210 : * This tends to upset hash_64, and it consistently
211 : * returns just one or two different values.
212 : *
213 : * shifting off the lower bits fixes things.
214 : */
215 0 : return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
216 : }
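/*
 * As an illustration: for a full stripe starting at logical address
 * 0x40000000 (1GiB), the bucket is hash_64(0x40000000 >> 16, 11) ==
 * hash_64(0x4000, 11), so only the higher, varying bits of the address
 * feed the hash.
 */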
217 :
218 : static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
219 : unsigned int page_nr)
220 : {
221 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
222 : const u32 sectors_per_page = PAGE_SIZE / sectorsize;
223 : int i;
224 :
225 : ASSERT(page_nr < rbio->nr_pages);
226 :
227 : for (i = sectors_per_page * page_nr;
228 : i < sectors_per_page * page_nr + sectors_per_page;
229 : i++) {
230 : if (!rbio->stripe_sectors[i].uptodate)
231 : return false;
232 : }
233 : return true;
234 : }
235 :
236 : /*
237 : * Update the stripe_sectors[] array to use correct page and pgoff
238 : *
239 : * Should be called every time any page pointer in stripe_pages[] is modified.
240 : */
241 0 : static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
242 : {
243 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
244 0 : u32 offset;
245 0 : int i;
246 :
247 0 : for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
248 0 : int page_index = offset >> PAGE_SHIFT;
249 :
250 0 : ASSERT(page_index < rbio->nr_pages);
251 0 : rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
252 0 : rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
253 : }
254 0 : }
255 :
256 0 : static void steal_rbio_page(struct btrfs_raid_bio *src,
257 : struct btrfs_raid_bio *dest, int page_nr)
258 : {
259 0 : const u32 sectorsize = src->bioc->fs_info->sectorsize;
260 0 : const u32 sectors_per_page = PAGE_SIZE / sectorsize;
261 0 : int i;
262 :
263 0 : if (dest->stripe_pages[page_nr])
264 0 : __free_page(dest->stripe_pages[page_nr]);
265 0 : dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
266 0 : src->stripe_pages[page_nr] = NULL;
267 :
268 : /* Also update the sector->uptodate bits. */
269 0 : for (i = sectors_per_page * page_nr;
270 0 : i < sectors_per_page * page_nr + sectors_per_page; i++)
271 0 : dest->stripe_sectors[i].uptodate = true;
272 0 : }
273 :
274 : static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
275 : {
276 0 : const int sector_nr = (page_nr << PAGE_SHIFT) >>
277 0 : rbio->bioc->fs_info->sectorsize_bits;
278 :
279 : /*
280 : * We have ensured PAGE_SIZE is aligned with sectorsize, thus
281 : * we won't have a page which is half data half parity.
282 : *
283 : * Thus if the first sector of the page belongs to data stripes, then
284 : * the full page belongs to data stripes.
285 : */
286 0 : return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
287 : }
288 :
289 : /*
290 : * Stealing an rbio means taking all the uptodate pages from the stripe array
291 : * in the source rbio and putting them into the destination rbio.
292 : *
293 : * This will also update the involved stripe_sectors[] which are referring to
294 : * the old pages.
295 : */
296 0 : static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
297 : {
298 0 : int i;
299 :
300 0 : if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
301 : return;
302 :
303 0 : for (i = 0; i < dest->nr_pages; i++) {
304 0 : struct page *p = src->stripe_pages[i];
305 :
306 : /*
307 : * We don't need to steal P/Q pages as they will always be
308 : * regenerated for RMW or full write anyway.
309 : */
310 0 : if (!is_data_stripe_page(src, i))
311 0 : continue;
312 :
313 : /*
314 : * If @src already has RBIO_CACHE_READY_BIT, it should have
315 : * all data stripe pages present and uptodate.
316 : */
317 0 : ASSERT(p);
318 0 : ASSERT(full_page_sectors_uptodate(src, i));
319 0 : steal_rbio_page(src, dest, i);
320 : }
321 0 : index_stripe_sectors(dest);
322 0 : index_stripe_sectors(src);
323 : }
324 :
325 : /*
326 : * merging means we take the bio_list from the victim and
327 : * splice it into the destination. The victim should
328 : * be discarded afterwards.
329 : *
330 : * Must be called with dest->bio_list_lock held.
331 : */
332 0 : static void merge_rbio(struct btrfs_raid_bio *dest,
333 : struct btrfs_raid_bio *victim)
334 : {
335 0 : bio_list_merge(&dest->bio_list, &victim->bio_list);
336 0 : dest->bio_list_bytes += victim->bio_list_bytes;
337 : /* Also inherit the bitmaps from @victim. */
338 0 : bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
339 0 : dest->stripe_nsectors);
340 0 : bio_list_init(&victim->bio_list);
341 0 : }
342 :
343 : /*
344 : * used to prune items that are in the cache. The caller
345 : * must hold the hash table lock.
346 : */
347 0 : static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
348 : {
349 0 : int bucket = rbio_bucket(rbio);
350 0 : struct btrfs_stripe_hash_table *table;
351 0 : struct btrfs_stripe_hash *h;
352 0 : int freeit = 0;
353 :
354 : /*
355 : * check the bit again under the hash table lock.
356 : */
357 0 : if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
358 : return;
359 :
360 0 : table = rbio->bioc->fs_info->stripe_hash_table;
361 0 : h = table->table + bucket;
362 :
363 : /* hold the lock for the bucket because we may be
364 : * removing it from the hash table
365 : */
366 0 : spin_lock(&h->lock);
367 :
368 : /*
369 : * hold the lock for the bio list because we need
370 : * to make sure the bio list is empty
371 : */
372 0 : spin_lock(&rbio->bio_list_lock);
373 :
374 0 : if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
375 0 : list_del_init(&rbio->stripe_cache);
376 0 : table->cache_size -= 1;
377 0 : freeit = 1;
378 :
379 : /* if the bio list isn't empty, this rbio is
380 : * still involved in an IO. We take it out
381 : * of the cache list, and drop the ref that
382 : * was held for the list.
383 : *
384 : * If the bio_list was empty, we also remove
385 : * the rbio from the hash_table, and drop
386 : * the corresponding ref
387 : */
388 0 : if (bio_list_empty(&rbio->bio_list)) {
389 0 : if (!list_empty(&rbio->hash_list)) {
390 0 : list_del_init(&rbio->hash_list);
391 0 : refcount_dec(&rbio->refs);
392 0 : BUG_ON(!list_empty(&rbio->plug_list));
393 : }
394 : }
395 : }
396 :
397 0 : spin_unlock(&rbio->bio_list_lock);
398 0 : spin_unlock(&h->lock);
399 :
400 0 : if (freeit)
401 0 : free_raid_bio(rbio);
402 : }
403 :
404 : /*
405 : * prune a given rbio from the cache
406 : */
407 0 : static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
408 : {
409 0 : struct btrfs_stripe_hash_table *table;
410 :
411 0 : if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
412 : return;
413 :
414 0 : table = rbio->bioc->fs_info->stripe_hash_table;
415 :
416 0 : spin_lock(&table->cache_lock);
417 0 : __remove_rbio_from_cache(rbio);
418 0 : spin_unlock(&table->cache_lock);
419 : }
420 :
421 : /*
422 : * remove everything in the cache
423 : */
424 3242 : static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
425 : {
426 3242 : struct btrfs_stripe_hash_table *table;
427 3242 : struct btrfs_raid_bio *rbio;
428 :
429 3242 : table = info->stripe_hash_table;
430 :
431 3242 : spin_lock(&table->cache_lock);
432 3242 : while (!list_empty(&table->stripe_cache)) {
433 0 : rbio = list_entry(table->stripe_cache.next,
434 : struct btrfs_raid_bio,
435 : stripe_cache);
436 0 : __remove_rbio_from_cache(rbio);
437 : }
438 3242 : spin_unlock(&table->cache_lock);
439 3242 : }
440 :
441 : /*
442 : * remove all cached entries and free the hash table
443 : * used by unmount
444 : */
445 3472 : void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
446 : {
447 3472 : if (!info->stripe_hash_table)
448 : return;
449 3242 : btrfs_clear_rbio_cache(info);
450 3242 : kvfree(info->stripe_hash_table);
451 3242 : info->stripe_hash_table = NULL;
452 : }
453 :
454 : /*
455 : * insert an rbio into the stripe cache. It
456 : * must have already been prepared by calling
457 : * cache_rbio_pages
458 : *
459 : * If this rbio was already cached, it gets
460 : * moved to the front of the lru.
461 : *
462 : * If the size of the rbio cache is too big, we
463 : * prune an item.
464 : */
465 0 : static void cache_rbio(struct btrfs_raid_bio *rbio)
466 : {
467 0 : struct btrfs_stripe_hash_table *table;
468 :
469 0 : if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
470 : return;
471 :
472 0 : table = rbio->bioc->fs_info->stripe_hash_table;
473 :
474 0 : spin_lock(&table->cache_lock);
475 0 : spin_lock(&rbio->bio_list_lock);
476 :
477 : /* bump our ref if we were not in the list before */
478 0 : if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
479 0 : refcount_inc(&rbio->refs);
480 :
481 0 : if (!list_empty(&rbio->stripe_cache)) {
482 0 : list_move(&rbio->stripe_cache, &table->stripe_cache);
483 : } else {
484 0 : list_add(&rbio->stripe_cache, &table->stripe_cache);
485 0 : table->cache_size += 1;
486 : }
487 :
488 0 : spin_unlock(&rbio->bio_list_lock);
489 :
490 0 : if (table->cache_size > RBIO_CACHE_SIZE) {
491 0 : struct btrfs_raid_bio *found;
492 :
493 0 : found = list_entry(table->stripe_cache.prev,
494 : struct btrfs_raid_bio,
495 : stripe_cache);
496 :
497 0 : if (found != rbio)
498 0 : __remove_rbio_from_cache(found);
499 : }
500 :
501 0 : spin_unlock(&table->cache_lock);
502 : }
503 :
504 : /*
505 : * helper function to run the xor_blocks api. It is only
506 : * able to do MAX_XOR_BLOCKS at a time, so we need to
507 : * loop through.
508 : */
509 0 : static void run_xor(void **pages, int src_cnt, ssize_t len)
510 : {
511 0 : int src_off = 0;
512 0 : int xor_src_cnt = 0;
513 0 : void *dest = pages[src_cnt];
514 :
515 0 : while (src_cnt > 0) {
516 0 : xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
517 0 : xor_blocks(xor_src_cnt, len, dest, pages + src_off);
518 :
519 0 : src_cnt -= xor_src_cnt;
520 0 : src_off += xor_src_cnt;
521 : }
522 0 : }
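/*
 * For example, if MAX_XOR_BLOCKS were 4 and src_cnt were 6, the loop above
 * would call xor_blocks() twice (first with 4 sources, then with the
 * remaining 2), both times accumulating into the same destination buffer
 * pages[src_cnt].
 */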
523 :
524 : /*
525 : * Returns true if the bio list inside this rbio covers an entire stripe (no
526 : * rmw required).
527 : */
528 0 : static int rbio_is_full(struct btrfs_raid_bio *rbio)
529 : {
530 0 : unsigned long size = rbio->bio_list_bytes;
531 0 : int ret = 1;
532 :
533 0 : spin_lock(&rbio->bio_list_lock);
534 0 : if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
535 0 : ret = 0;
536 0 : BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
537 0 : spin_unlock(&rbio->bio_list_lock);
538 :
539 0 : return ret;
540 : }
541 :
542 : /*
543 : * returns 1 if it is safe to merge two rbios together.
544 : * The merging is safe if the two rbios correspond to
545 : * the same stripe and if they are both going in the same
546 : * direction (read vs write), and if neither one is
547 : * locked for final IO
548 : *
549 : * The caller is responsible for locking such that
550 : * rmw_locked is safe to test
551 : */
552 0 : static int rbio_can_merge(struct btrfs_raid_bio *last,
553 : struct btrfs_raid_bio *cur)
554 : {
555 0 : if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
556 0 : test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
557 : return 0;
558 :
559 : /*
560 : * we can't merge with cached rbios, since the
561 : * idea is that when we merge the destination
562 : * rbio is going to run our IO for us. We can
563 : * steal from cached rbios though, other functions
564 : * handle that.
565 : */
566 0 : if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
567 0 : test_bit(RBIO_CACHE_BIT, &cur->flags))
568 : return 0;
569 :
570 0 : if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
571 : return 0;
572 :
573 : /* we can't merge with different operations */
574 0 : if (last->operation != cur->operation)
575 : return 0;
576 : /*
577 : * We need to read the full stripe from the drive, check
578 : * and repair the parity and write the new results back.
579 : *
580 : * We're not allowed to add any new bios to the
581 : * bio list here, anyone else that wants to
582 : * change this stripe needs to do their own rmw.
583 : */
584 0 : if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
585 : return 0;
586 :
587 0 : if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
588 : last->operation == BTRFS_RBIO_READ_REBUILD)
589 0 : return 0;
590 :
591 : return 1;
592 : }
593 :
594 : static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
595 : unsigned int stripe_nr,
596 : unsigned int sector_nr)
597 : {
598 0 : ASSERT(stripe_nr < rbio->real_stripes);
599 0 : ASSERT(sector_nr < rbio->stripe_nsectors);
600 :
601 0 : return stripe_nr * rbio->stripe_nsectors + sector_nr;
602 : }
603 :
604 : /* Return a sector from rbio->stripe_sectors, not from the bio list */
605 : static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
606 : unsigned int stripe_nr,
607 : unsigned int sector_nr)
608 : {
609 0 : return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
610 : sector_nr)];
611 : }
612 :
613 : /* Grab a sector inside P stripe */
614 : static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
615 : unsigned int sector_nr)
616 : {
617 0 : return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
618 : }
619 :
620 : /* Grab a sector inside Q stripe, return NULL if not RAID6 */
621 : static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
622 : unsigned int sector_nr)
623 : {
624 0 : if (rbio->nr_data + 1 == rbio->real_stripes)
625 : return NULL;
626 0 : return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
627 : }
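/*
 * The helpers above rely on stripe_sectors[] being laid out stripe by
 * stripe: sector @sector_nr of stripe @stripe_nr lives at index
 * stripe_nr * stripe_nsectors + sector_nr.  With the fixed 64K stripe
 * length and a 4K sector size that is 16 sectors per stripe, so e.g. the
 * P stripe of a 3-disk RAID5 (nr_data == 2) starts at index 32.
 */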
628 :
629 : /*
630 : * The first stripe in the table for a logical address
631 : * has the lock. rbios are added in one of three ways:
632 : *
633 : * 1) Nobody has the stripe locked yet. The rbio is given
634 : * the lock and 0 is returned. The caller must start the IO
635 : * themselves.
636 : *
637 : * 2) Someone has the stripe locked, but we're able to merge
638 : * with the lock owner. The rbio is freed and the IO will
639 : * start automatically along with the existing rbio. 1 is returned.
640 : *
641 : * 3) Someone has the stripe locked, but we're not able to merge.
642 : * The rbio is added to the lock owner's plug list, or merged into
643 : * an rbio already on the plug list. When the lock owner unlocks,
644 : * the next rbio on the list is run and the IO is started automatically.
645 : * 1 is returned
646 : *
647 : * If we return 0, the caller still owns the rbio and must continue with
648 : * IO submission. If we return 1, the caller must assume the rbio has
649 : * already been freed.
650 : */
651 0 : static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
652 : {
653 0 : struct btrfs_stripe_hash *h;
654 0 : struct btrfs_raid_bio *cur;
655 0 : struct btrfs_raid_bio *pending;
656 0 : struct btrfs_raid_bio *freeit = NULL;
657 0 : struct btrfs_raid_bio *cache_drop = NULL;
658 0 : int ret = 0;
659 :
660 0 : h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
661 :
662 0 : spin_lock(&h->lock);
663 0 : list_for_each_entry(cur, &h->hash_list, hash_list) {
664 0 : if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
665 0 : continue;
666 :
667 0 : spin_lock(&cur->bio_list_lock);
668 :
669 : /* Can we steal this cached rbio's pages? */
670 0 : if (bio_list_empty(&cur->bio_list) &&
671 0 : list_empty(&cur->plug_list) &&
672 0 : test_bit(RBIO_CACHE_BIT, &cur->flags) &&
673 0 : !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
674 0 : list_del_init(&cur->hash_list);
675 0 : refcount_dec(&cur->refs);
676 :
677 0 : steal_rbio(cur, rbio);
678 0 : cache_drop = cur;
679 0 : spin_unlock(&cur->bio_list_lock);
680 :
681 0 : goto lockit;
682 : }
683 :
684 : /* Can we merge into the lock owner? */
685 0 : if (rbio_can_merge(cur, rbio)) {
686 0 : merge_rbio(cur, rbio);
687 0 : spin_unlock(&cur->bio_list_lock);
688 0 : freeit = rbio;
689 0 : ret = 1;
690 0 : goto out;
691 : }
692 :
693 :
694 : /*
695 : * We couldn't merge with the running rbio, see if we can merge
696 : * with the pending ones. We don't have to check for rmw_locked
697 : * because there is no way they are inside finish_rmw right now
698 : */
699 0 : list_for_each_entry(pending, &cur->plug_list, plug_list) {
700 0 : if (rbio_can_merge(pending, rbio)) {
701 0 : merge_rbio(pending, rbio);
702 0 : spin_unlock(&cur->bio_list_lock);
703 0 : freeit = rbio;
704 0 : ret = 1;
705 0 : goto out;
706 : }
707 : }
708 :
709 : /*
710 : * No merging, put us on the tail of the plug list, our rbio
711 : * will be started when the currently running rbio unlocks.
712 : */
713 0 : list_add_tail(&rbio->plug_list, &cur->plug_list);
714 0 : spin_unlock(&cur->bio_list_lock);
715 0 : ret = 1;
716 0 : goto out;
717 : }
718 0 : lockit:
719 0 : refcount_inc(&rbio->refs);
720 0 : list_add(&rbio->hash_list, &h->hash_list);
721 0 : out:
722 0 : spin_unlock(&h->lock);
723 0 : if (cache_drop)
724 0 : remove_rbio_from_cache(cache_drop);
725 0 : if (freeit)
726 0 : free_raid_bio(freeit);
727 0 : return ret;
728 : }
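/*
 * A hypothetical caller (a sketch, not taken from this file) would use
 * the return value roughly like this:
 *
 *	if (lock_stripe_add(rbio) == 0)
 *		we own the stripe lock and must submit the IO ourselves;
 *	else
 *		the rbio was merged or queued on the plug list and may
 *		already have been freed, so it must not be touched again;
 */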
729 :
730 : static void recover_rbio_work_locked(struct work_struct *work);
731 :
732 : /*
733 : * Called when an rmw or parity rebuild is completed. If the plug list has more
734 : * rbios waiting for this stripe, the next one on the list will be started
735 : */
736 0 : static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
737 : {
738 0 : int bucket;
739 0 : struct btrfs_stripe_hash *h;
740 0 : int keep_cache = 0;
741 :
742 0 : bucket = rbio_bucket(rbio);
743 0 : h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
744 :
745 0 : if (list_empty(&rbio->plug_list))
746 0 : cache_rbio(rbio);
747 :
748 0 : spin_lock(&h->lock);
749 0 : spin_lock(&rbio->bio_list_lock);
750 :
751 0 : if (!list_empty(&rbio->hash_list)) {
752 : /*
753 : * if we're still cached and there is no other IO
754 : * to perform, just leave this rbio here for others
755 : * to steal from later
756 : */
757 0 : if (list_empty(&rbio->plug_list) &&
758 0 : test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
759 0 : keep_cache = 1;
760 0 : clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
761 0 : BUG_ON(!bio_list_empty(&rbio->bio_list));
762 0 : goto done;
763 : }
764 :
765 0 : list_del_init(&rbio->hash_list);
766 0 : refcount_dec(&rbio->refs);
767 :
768 : /*
769 : * we use the plug list to hold all the rbios
770 : * waiting for the chance to lock this stripe.
771 : * hand the lock over to one of them.
772 : */
773 0 : if (!list_empty(&rbio->plug_list)) {
774 0 : struct btrfs_raid_bio *next;
775 0 : struct list_head *head = rbio->plug_list.next;
776 :
777 0 : next = list_entry(head, struct btrfs_raid_bio,
778 : plug_list);
779 :
780 0 : list_del_init(&rbio->plug_list);
781 :
782 0 : list_add(&next->hash_list, &h->hash_list);
783 0 : refcount_inc(&next->refs);
784 0 : spin_unlock(&rbio->bio_list_lock);
785 0 : spin_unlock(&h->lock);
786 :
787 0 : if (next->operation == BTRFS_RBIO_READ_REBUILD)
788 0 : start_async_work(next, recover_rbio_work_locked);
789 0 : else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
790 0 : steal_rbio(rbio, next);
791 0 : start_async_work(next, recover_rbio_work_locked);
792 0 : } else if (next->operation == BTRFS_RBIO_WRITE) {
793 0 : steal_rbio(rbio, next);
794 0 : start_async_work(next, rmw_rbio_work_locked);
795 0 : } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
796 0 : steal_rbio(rbio, next);
797 0 : start_async_work(next, scrub_rbio_work_locked);
798 : }
799 :
800 0 : goto done_nolock;
801 : }
802 : }
803 0 : done:
804 0 : spin_unlock(&rbio->bio_list_lock);
805 0 : spin_unlock(&h->lock);
806 :
807 : done_nolock:
808 0 : if (!keep_cache)
809 0 : remove_rbio_from_cache(rbio);
810 0 : }
811 :
812 0 : static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
813 : {
814 0 : struct bio *next;
815 :
816 0 : while (cur) {
817 0 : next = cur->bi_next;
818 0 : cur->bi_next = NULL;
819 0 : cur->bi_status = err;
820 0 : bio_endio(cur);
821 0 : cur = next;
822 : }
823 0 : }
824 :
825 : /*
826 : * this frees the rbio and runs through all the bios in the
827 : * bio_list and calls end_io on them
828 : */
829 0 : static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
830 : {
831 0 : struct bio *cur = bio_list_get(&rbio->bio_list);
832 0 : struct bio *extra;
833 :
834 0 : kfree(rbio->csum_buf);
835 0 : bitmap_free(rbio->csum_bitmap);
836 0 : rbio->csum_buf = NULL;
837 0 : rbio->csum_bitmap = NULL;
838 :
839 : /*
840 : * Clear the data bitmap, as the rbio may be cached for later usage.
841 : * Do this before unlock_stripe() so there will be no new bio
842 : * for this rbio.
843 : */
844 0 : bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
845 :
846 : /*
847 : * At this moment, rbio->bio_list is empty, however since rbio does not
848 : * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
849 : * hash list, rbio may be merged with others so that rbio->bio_list
850 : * becomes non-empty.
851 : * Once unlock_stripe() is done, rbio->bio_list will not be updated any
852 : * more and we can call bio_endio() on all queued bios.
853 : */
854 0 : unlock_stripe(rbio);
855 0 : extra = bio_list_get(&rbio->bio_list);
856 0 : free_raid_bio(rbio);
857 :
858 0 : rbio_endio_bio_list(cur, err);
859 0 : if (extra)
860 0 : rbio_endio_bio_list(extra, err);
861 0 : }
862 :
863 : /*
864 : * Get a sector pointer specified by its @stripe_nr and @sector_nr.
865 : *
866 : * @rbio: The raid bio
867 : * @stripe_nr: Stripe number, valid range [0, real_stripes)
868 : * @sector_nr: Sector number inside the stripe,
869 : * valid range [0, stripe_nsectors)
870 : * @bio_list_only: Whether to use sectors inside the bio list only.
871 : *
872 : * The read/modify/write code wants to reuse the original bio page as much
873 : * as possible, and only use stripe_sectors as fallback.
874 : */
875 0 : static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
876 : int stripe_nr, int sector_nr,
877 : bool bio_list_only)
878 : {
879 0 : struct sector_ptr *sector;
880 0 : int index;
881 :
882 0 : ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
883 0 : ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
884 :
885 0 : index = stripe_nr * rbio->stripe_nsectors + sector_nr;
886 0 : ASSERT(index >= 0 && index < rbio->nr_sectors);
887 :
888 0 : spin_lock(&rbio->bio_list_lock);
889 0 : sector = &rbio->bio_sectors[index];
890 0 : if (sector->page || bio_list_only) {
891 : /* Don't return sector without a valid page pointer */
892 0 : if (!sector->page)
893 0 : sector = NULL;
894 0 : spin_unlock(&rbio->bio_list_lock);
895 0 : return sector;
896 : }
897 0 : spin_unlock(&rbio->bio_list_lock);
898 :
899 0 : return &rbio->stripe_sectors[index];
900 : }
901 :
902 : /*
903 : * Allocation and initial setup for the btrfs_raid_bio. Note that
904 : * this does not allocate any pages for rbio->stripe_pages.
905 : */
906 0 : static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
907 : struct btrfs_io_context *bioc)
908 : {
909 0 : const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
910 0 : const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
911 0 : const unsigned int num_pages = stripe_npages * real_stripes;
912 0 : const unsigned int stripe_nsectors =
913 0 : BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
914 0 : const unsigned int num_sectors = stripe_nsectors * real_stripes;
915 0 : struct btrfs_raid_bio *rbio;
916 :
917 : /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
918 0 : ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
919 : /*
920 : * Our current stripe len should be fixed to 64k thus stripe_nsectors
921 : * (at most 16) should be no larger than BITS_PER_LONG.
922 : */
923 0 : ASSERT(stripe_nsectors <= BITS_PER_LONG);
924 :
925 0 : rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
926 0 : if (!rbio)
927 : return ERR_PTR(-ENOMEM);
928 0 : rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
929 : GFP_NOFS);
930 0 : rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
931 : GFP_NOFS);
932 0 : rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
933 : GFP_NOFS);
934 0 : rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
935 0 : rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
936 :
937 0 : if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
938 0 : !rbio->finish_pointers || !rbio->error_bitmap) {
939 0 : free_raid_bio_pointers(rbio);
940 0 : kfree(rbio);
941 0 : return ERR_PTR(-ENOMEM);
942 : }
943 :
944 0 : bio_list_init(&rbio->bio_list);
945 0 : init_waitqueue_head(&rbio->io_wait);
946 0 : INIT_LIST_HEAD(&rbio->plug_list);
947 0 : spin_lock_init(&rbio->bio_list_lock);
948 0 : INIT_LIST_HEAD(&rbio->stripe_cache);
949 0 : INIT_LIST_HEAD(&rbio->hash_list);
950 0 : btrfs_get_bioc(bioc);
951 0 : rbio->bioc = bioc;
952 0 : rbio->nr_pages = num_pages;
953 0 : rbio->nr_sectors = num_sectors;
954 0 : rbio->real_stripes = real_stripes;
955 0 : rbio->stripe_npages = stripe_npages;
956 0 : rbio->stripe_nsectors = stripe_nsectors;
957 0 : refcount_set(&rbio->refs, 1);
958 0 : atomic_set(&rbio->stripes_pending, 0);
959 :
960 0 : ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
961 0 : rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
962 :
963 0 : return rbio;
964 : }
965 :
966 : /* allocate pages for all the stripes in the bio, including parity */
967 0 : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
968 : {
969 0 : int ret;
970 :
971 0 : ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
972 0 : if (ret < 0)
973 : return ret;
974 : /* Mapping all sectors */
975 0 : index_stripe_sectors(rbio);
976 0 : return 0;
977 : }
978 :
979 : /* only allocate pages for p/q stripes */
980 0 : static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
981 : {
982 0 : const int data_pages = rbio->nr_data * rbio->stripe_npages;
983 0 : int ret;
984 :
985 0 : ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
986 0 : rbio->stripe_pages + data_pages);
987 0 : if (ret < 0)
988 : return ret;
989 :
990 0 : index_stripe_sectors(rbio);
991 0 : return 0;
992 : }
993 :
994 : /*
995 : * Return the total number of errors found in the vertical stripe of @sector_nr.
996 : *
997 : * @faila and @failb will also be updated to the first and second stripe
998 : * number of the errors.
999 : */
1000 0 : static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1001 : int *faila, int *failb)
1002 : {
1003 0 : int stripe_nr;
1004 0 : int found_errors = 0;
1005 :
1006 0 : if (faila || failb) {
1007 : /*
1008 : * Both @faila and @failb should be valid pointers if any of
1009 : * them is specified.
1010 : */
1011 0 : ASSERT(faila && failb);
1012 0 : *faila = -1;
1013 0 : *failb = -1;
1014 : }
1015 :
1016 0 : for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1017 0 : int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1018 :
1019 0 : if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1020 0 : found_errors++;
1021 0 : if (faila) {
1022 : /* Update faila and failb. */
1023 0 : if (*faila < 0)
1024 0 : *faila = stripe_nr;
1025 0 : else if (*failb < 0)
1026 0 : *failb = stripe_nr;
1027 : }
1028 : }
1029 : }
1030 0 : return found_errors;
1031 : }
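/*
 * Callers compare the returned count against rbio->bioc->max_errors, which
 * reflects the redundancy of the profile: one tolerated failure per
 * vertical stripe for RAID5, two for RAID6.
 */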
1032 :
1033 : /*
1034 : * Add a single sector @sector into our list of bios for IO.
1035 : *
1036 : * Return 0 if everything went well.
1037 : * Return <0 for error.
1038 : */
1039 0 : static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1040 : struct bio_list *bio_list,
1041 : struct sector_ptr *sector,
1042 : unsigned int stripe_nr,
1043 : unsigned int sector_nr,
1044 : enum req_op op)
1045 : {
1046 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1047 0 : struct bio *last = bio_list->tail;
1048 0 : int ret;
1049 0 : struct bio *bio;
1050 0 : struct btrfs_io_stripe *stripe;
1051 0 : u64 disk_start;
1052 :
1053 : /*
1054 : * Note: here stripe_nr has taken device replace into consideration,
1055 : * thus it can be larger than rbio->real_stripes.
1056 : * So here we check against bioc->num_stripes, not rbio->real_stripes.
1057 : */
1058 0 : ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1059 0 : ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1060 0 : ASSERT(sector->page);
1061 :
1062 0 : stripe = &rbio->bioc->stripes[stripe_nr];
1063 0 : disk_start = stripe->physical + sector_nr * sectorsize;
1064 :
1065 : /* if the device is missing, just fail this stripe */
1066 0 : if (!stripe->dev->bdev) {
1067 0 : int found_errors;
1068 :
1069 0 : set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1070 0 : rbio->error_bitmap);
1071 :
1072 : /* Check if we have reached tolerance early. */
1073 0 : found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1074 : NULL, NULL);
1075 0 : if (found_errors > rbio->bioc->max_errors)
1076 : return -EIO;
1077 0 : return 0;
1078 : }
1079 :
1080 : /* see if we can add this page onto our existing bio */
1081 0 : if (last) {
1082 0 : u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1083 0 : last_end += last->bi_iter.bi_size;
1084 :
1085 : /*
1086 : * we can't merge these if they are from different
1087 : * devices or if they are not contiguous
1088 : */
1089 0 : if (last_end == disk_start && !last->bi_status &&
1090 0 : last->bi_bdev == stripe->dev->bdev) {
1091 0 : ret = bio_add_page(last, sector->page, sectorsize,
1092 0 : sector->pgoff);
1093 0 : if (ret == sectorsize)
1094 : return 0;
1095 : }
1096 : }
1097 :
1098 : /* put a new bio on the list */
1099 0 : bio = bio_alloc(stripe->dev->bdev,
1100 : max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1101 : op, GFP_NOFS);
1102 0 : bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1103 0 : bio->bi_private = rbio;
1104 :
1105 0 : __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1106 0 : bio_list_add(bio_list, bio);
1107 0 : return 0;
1108 : }
1109 :
1110 0 : static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1111 : {
1112 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1113 0 : struct bio_vec bvec;
1114 0 : struct bvec_iter iter;
1115 0 : u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1116 0 : rbio->bioc->full_stripe_logical;
1117 :
1118 0 : bio_for_each_segment(bvec, bio, iter) {
1119 0 : u32 bvec_offset;
1120 :
1121 0 : for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1122 0 : bvec_offset += sectorsize, offset += sectorsize) {
1123 0 : int index = offset / sectorsize;
1124 0 : struct sector_ptr *sector = &rbio->bio_sectors[index];
1125 :
1126 0 : sector->page = bvec.bv_page;
1127 0 : sector->pgoff = bvec.bv_offset + bvec_offset;
1128 0 : ASSERT(sector->pgoff < PAGE_SIZE);
1129 : }
1130 : }
1131 0 : }
1132 :
1133 : /*
1134 : * Helper function to walk our bio list and populate the bio_sectors array with
1135 : * the result. This seems expensive, but it is faster than constantly
1136 : * searching through the bio list as we set up the IO for RMW or stripe
1137 : * reconstruction.
1138 : *
1139 : * This must be called before you trust the answers from sector_in_rbio().
1140 : */
1141 0 : static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1142 : {
1143 0 : struct bio *bio;
1144 :
1145 0 : spin_lock(&rbio->bio_list_lock);
1146 0 : bio_list_for_each(bio, &rbio->bio_list)
1147 0 : index_one_bio(rbio, bio);
1148 :
1149 0 : spin_unlock(&rbio->bio_list_lock);
1150 0 : }
1151 :
1152 0 : static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1153 : struct raid56_bio_trace_info *trace_info)
1154 : {
1155 0 : const struct btrfs_io_context *bioc = rbio->bioc;
1156 0 : int i;
1157 :
1158 0 : ASSERT(bioc);
1159 :
1160 : /* We rely on bio->bi_bdev to find the stripe number. */
1161 0 : if (!bio->bi_bdev)
1162 0 : goto not_found;
1163 :
1164 0 : for (i = 0; i < bioc->num_stripes; i++) {
1165 0 : if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1166 0 : continue;
1167 0 : trace_info->stripe_nr = i;
1168 0 : trace_info->devid = bioc->stripes[i].dev->devid;
1169 0 : trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1170 0 : bioc->stripes[i].physical;
1171 0 : return;
1172 : }
1173 :
1174 0 : not_found:
1175 0 : trace_info->devid = -1;
1176 0 : trace_info->offset = -1;
1177 0 : trace_info->stripe_nr = -1;
1178 : }
1179 :
1180 0 : static inline void bio_list_put(struct bio_list *bio_list)
1181 : {
1182 0 : struct bio *bio;
1183 :
1184 0 : while ((bio = bio_list_pop(bio_list)))
1185 0 : bio_put(bio);
1186 0 : }
1187 :
1188 : /* Generate PQ for one vertical stripe. */
1189 0 : static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1190 : {
1191 0 : void **pointers = rbio->finish_pointers;
1192 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1193 0 : struct sector_ptr *sector;
1194 0 : int stripe;
1195 0 : const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1196 :
1197 : /* First collect one sector from each data stripe */
1198 0 : for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1199 0 : sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1200 0 : pointers[stripe] = kmap_local_page(sector->page) +
1201 0 : sector->pgoff;
1202 : }
1203 :
1204 : /* Then add the parity stripe */
1205 0 : sector = rbio_pstripe_sector(rbio, sectornr);
1206 0 : sector->uptodate = 1;
1207 0 : pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1208 :
1209 0 : if (has_qstripe) {
1210 : /*
1211 : * RAID6, add the qstripe and call the library function
1212 : * to fill in our p/q
1213 : */
1214 0 : sector = rbio_qstripe_sector(rbio, sectornr);
1215 0 : sector->uptodate = 1;
1216 0 : pointers[stripe++] = kmap_local_page(sector->page) +
1217 0 : sector->pgoff;
1218 :
1219 0 : raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1220 : pointers);
1221 : } else {
1222 : /* raid5 */
1223 0 : memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1224 0 : run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1225 : }
1226 0 : for (stripe = stripe - 1; stripe >= 0; stripe--)
1227 : kunmap_local(pointers[stripe]);
1228 0 : }
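/*
 * For reference, the math performed per vertical stripe above is:
 *
 *	P = D0 ^ D1 ^ ... ^ D(n-1)			(RAID5 and RAID6)
 *	Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*D(n-1)	(RAID6 only)
 *
 * where the Q terms are multiplications by powers of the generator g in
 * GF(2^8), as computed by raid6_call.gen_syndrome().
 */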
1229 :
1230 0 : static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1231 : struct bio_list *bio_list)
1232 : {
1233 : /* The total sector number inside the full stripe. */
1234 0 : int total_sector_nr;
1235 0 : int sectornr;
1236 0 : int stripe;
1237 0 : int ret;
1238 :
1239 0 : ASSERT(bio_list_size(bio_list) == 0);
1240 :
1241 : /* We should have at least one data sector. */
1242 0 : ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1243 :
1244 : /*
1245 : * Reset errors, as we may have errors inherited from a degraded
1246 : * write.
1247 : */
1248 0 : bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1249 :
1250 : /*
1251 : * Start assembly. Make bios for everything from the higher layers (the
1252 : * bio_list in our rbio) and our P/Q. Ignore everything else.
1253 : */
1254 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1255 0 : total_sector_nr++) {
1256 0 : struct sector_ptr *sector;
1257 :
1258 0 : stripe = total_sector_nr / rbio->stripe_nsectors;
1259 0 : sectornr = total_sector_nr % rbio->stripe_nsectors;
1260 :
1261 : /* This vertical stripe has no data, skip it. */
1262 0 : if (!test_bit(sectornr, &rbio->dbitmap))
1263 0 : continue;
1264 :
1265 0 : if (stripe < rbio->nr_data) {
1266 0 : sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1267 0 : if (!sector)
1268 0 : continue;
1269 : } else {
1270 0 : sector = rbio_stripe_sector(rbio, stripe, sectornr);
1271 : }
1272 :
1273 0 : ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1274 : sectornr, REQ_OP_WRITE);
1275 0 : if (ret)
1276 0 : goto error;
1277 : }
1278 :
1279 0 : if (likely(!rbio->bioc->replace_nr_stripes))
1280 : return 0;
1281 :
1282 : /*
1283 : * Make a copy for the replace target device.
1284 : *
1285 : * Thus the source stripe number (in replace_stripe_src) should be valid.
1286 : */
1287 : ASSERT(rbio->bioc->replace_stripe_src >= 0);
1288 :
1289 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1290 0 : total_sector_nr++) {
1291 0 : struct sector_ptr *sector;
1292 :
1293 0 : stripe = total_sector_nr / rbio->stripe_nsectors;
1294 0 : sectornr = total_sector_nr % rbio->stripe_nsectors;
1295 :
1296 : /*
1297 : * For RAID56, there is only one device that can be replaced,
1298 : * and replace_stripe_src indicates the stripe number we
1299 : * need to copy from.
1300 : */
1301 0 : if (stripe != rbio->bioc->replace_stripe_src) {
1302 : /*
1303 : * We can skip the whole stripe completely, note
1304 : * total_sector_nr will be increased by one anyway.
1305 : */
1306 0 : ASSERT(sectornr == 0);
1307 0 : total_sector_nr += rbio->stripe_nsectors - 1;
1308 0 : continue;
1309 : }
1310 :
1311 : /* This vertical stripe has no data, skip it. */
1312 0 : if (!test_bit(sectornr, &rbio->dbitmap))
1313 0 : continue;
1314 :
1315 0 : if (stripe < rbio->nr_data) {
1316 0 : sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1317 0 : if (!sector)
1318 0 : continue;
1319 : } else {
1320 0 : sector = rbio_stripe_sector(rbio, stripe, sectornr);
1321 : }
1322 :
1323 0 : ret = rbio_add_io_sector(rbio, bio_list, sector,
1324 0 : rbio->real_stripes,
1325 : sectornr, REQ_OP_WRITE);
1326 0 : if (ret)
1327 0 : goto error;
1328 : }
1329 :
1330 : return 0;
1331 0 : error:
1332 0 : bio_list_put(bio_list);
1333 0 : return -EIO;
1334 : }
1335 :
1336 0 : static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1337 : {
1338 0 : struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1339 0 : u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1340 0 : rbio->bioc->full_stripe_logical;
1341 0 : int total_nr_sector = offset >> fs_info->sectorsize_bits;
1342 :
1343 0 : ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1344 :
1345 0 : bitmap_set(rbio->error_bitmap, total_nr_sector,
1346 0 : bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1347 :
1348 : /*
1349 : * Special handling for raid56_alloc_missing_rbio() used by
1350 : * scrub/replace. Unlike call path in raid56_parity_recover(), they
1351 : * pass an empty bio here. Thus we have to find out the missing device
1352 : * and mark the stripe error instead.
1353 : */
1354 0 : if (bio->bi_iter.bi_size == 0) {
1355 : bool found_missing = false;
1356 : int stripe_nr;
1357 :
1358 0 : for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1359 0 : if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1360 0 : found_missing = true;
1361 0 : bitmap_set(rbio->error_bitmap,
1362 0 : stripe_nr * rbio->stripe_nsectors,
1363 0 : rbio->stripe_nsectors);
1364 : }
1365 : }
1366 0 : ASSERT(found_missing);
1367 : }
1368 0 : }
1369 :
1370 : /*
1371 : * For the subpage case, we can no longer set a page uptodate directly for
1372 : * stripe_pages[], thus we need to locate the sector.
1373 : */
1374 0 : static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1375 : struct page *page,
1376 : unsigned int pgoff)
1377 : {
1378 0 : int i;
1379 :
1380 0 : for (i = 0; i < rbio->nr_sectors; i++) {
1381 0 : struct sector_ptr *sector = &rbio->stripe_sectors[i];
1382 :
1383 0 : if (sector->page == page && sector->pgoff == pgoff)
1384 0 : return sector;
1385 : }
1386 : return NULL;
1387 : }
1388 :
1389 : /*
1390 : * this sets each page in the bio uptodate. It should only be used on private
1391 : * rbio pages, nothing that comes in from the higher layers
1392 : */
1393 0 : static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1394 : {
1395 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1396 0 : struct bio_vec *bvec;
1397 0 : struct bvec_iter_all iter_all;
1398 :
1399 0 : ASSERT(!bio_flagged(bio, BIO_CLONED));
1400 :
1401 0 : bio_for_each_segment_all(bvec, bio, iter_all) {
1402 0 : struct sector_ptr *sector;
1403 0 : int pgoff;
1404 :
1405 0 : for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1406 0 : pgoff += sectorsize) {
1407 0 : sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1408 0 : ASSERT(sector);
1409 0 : if (sector)
1410 0 : sector->uptodate = 1;
1411 : }
1412 : }
1413 0 : }
1414 :
1415 0 : static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1416 : {
1417 0 : struct bio_vec *bv = bio_first_bvec_all(bio);
1418 0 : int i;
1419 :
1420 0 : for (i = 0; i < rbio->nr_sectors; i++) {
1421 0 : struct sector_ptr *sector;
1422 :
1423 0 : sector = &rbio->stripe_sectors[i];
1424 0 : if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1425 : break;
1426 0 : sector = &rbio->bio_sectors[i];
1427 0 : if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1428 : break;
1429 : }
1430 0 : ASSERT(i < rbio->nr_sectors);
1431 0 : return i;
1432 : }
1433 :
1434 0 : static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1435 : {
1436 0 : int total_sector_nr = get_bio_sector_nr(rbio, bio);
1437 0 : u32 bio_size = 0;
1438 0 : struct bio_vec *bvec;
1439 0 : int i;
1440 :
1441 0 : bio_for_each_bvec_all(bvec, bio, i)
1442 0 : bio_size += bvec->bv_len;
1443 :
1444 : /*
1445 : * Since we can have multiple bios touching the error_bitmap, we cannot
1446 : * call bitmap_set() without protection.
1447 : *
1448 : * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1449 : */
1450 0 : for (i = total_sector_nr; i < total_sector_nr +
1451 0 : (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1452 0 : set_bit(i, rbio->error_bitmap);
1453 0 : }
1454 :
1455 : /* Verify the data sectors at read time. */
1456 0 : static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1457 : struct bio *bio)
1458 : {
1459 0 : struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1460 0 : int total_sector_nr = get_bio_sector_nr(rbio, bio);
1461 0 : struct bio_vec *bvec;
1462 0 : struct bvec_iter_all iter_all;
1463 :
1464 : /* No data csum for the whole stripe, no need to verify. */
1465 0 : if (!rbio->csum_bitmap || !rbio->csum_buf)
1466 0 : return;
1467 :
1468 : /* P/Q stripes, they have no data csum to verify against. */
1469 0 : if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1470 : return;
1471 :
1472 0 : bio_for_each_segment_all(bvec, bio, iter_all) {
1473 0 : int bv_offset;
1474 :
1475 0 : for (bv_offset = bvec->bv_offset;
1476 0 : bv_offset < bvec->bv_offset + bvec->bv_len;
1477 0 : bv_offset += fs_info->sectorsize, total_sector_nr++) {
1478 0 : u8 csum_buf[BTRFS_CSUM_SIZE];
1479 0 : u8 *expected_csum = rbio->csum_buf +
1480 0 : total_sector_nr * fs_info->csum_size;
1481 0 : int ret;
1482 :
1483 : /* No csum for this sector, skip to the next sector. */
1484 0 : if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1485 0 : continue;
1486 :
1487 0 : ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
1488 : bv_offset, csum_buf, expected_csum);
1489 0 : if (ret < 0)
1490 0 : set_bit(total_sector_nr, rbio->error_bitmap);
1491 : }
1492 : }
1493 : }
1494 :
1495 0 : static void raid_wait_read_end_io(struct bio *bio)
1496 : {
1497 0 : struct btrfs_raid_bio *rbio = bio->bi_private;
1498 :
1499 0 : if (bio->bi_status) {
1500 0 : rbio_update_error_bitmap(rbio, bio);
1501 : } else {
1502 0 : set_bio_pages_uptodate(rbio, bio);
1503 0 : verify_bio_data_sectors(rbio, bio);
1504 : }
1505 :
1506 0 : bio_put(bio);
1507 0 : if (atomic_dec_and_test(&rbio->stripes_pending))
1508 0 : wake_up(&rbio->io_wait);
1509 0 : }
1510 :
1511 0 : static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1512 : struct bio_list *bio_list)
1513 : {
1514 0 : struct bio *bio;
1515 :
1516 0 : atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1517 0 : while ((bio = bio_list_pop(bio_list))) {
1518 0 : bio->bi_end_io = raid_wait_read_end_io;
1519 :
1520 0 : if (trace_raid56_scrub_read_recover_enabled()) {
1521 0 : struct raid56_bio_trace_info trace_info = { 0 };
1522 :
1523 0 : bio_get_trace_info(rbio, bio, &trace_info);
1524 0 : trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
1525 : }
1526 0 : submit_bio(bio);
1527 : }
1528 :
1529 0 : wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1530 0 : }
1531 :
1532 0 : static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1533 : {
1534 0 : const int data_pages = rbio->nr_data * rbio->stripe_npages;
1535 0 : int ret;
1536 :
1537 0 : ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
1538 0 : if (ret < 0)
1539 : return ret;
1540 :
1541 0 : index_stripe_sectors(rbio);
1542 0 : return 0;
1543 : }
1544 :
1545 : /*
1546 : * We use plugging callbacks to collect full stripes.
1547 : * Any time we get a partial stripe write while plugged
1548 : * we collect it into a list. When the unplug comes down,
1549 : * we sort the list by logical block number and merge
1550 : * everything we can into the same rbios
1551 : */
1552 : struct btrfs_plug_cb {
1553 : struct blk_plug_cb cb;
1554 : struct btrfs_fs_info *info;
1555 : struct list_head rbio_list;
1556 : struct work_struct work;
1557 : };
1558 :
1559 : /*
1560 : * rbios on the plug list are sorted for easier merging.
1561 : */
1562 0 : static int plug_cmp(void *priv, const struct list_head *a,
1563 : const struct list_head *b)
1564 : {
1565 0 : const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1566 : plug_list);
1567 0 : const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1568 : plug_list);
1569 0 : u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1570 0 : u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1571 :
1572 0 : if (a_sector < b_sector)
1573 : return -1;
1574 0 : if (a_sector > b_sector)
1575 0 : return 1;
1576 : return 0;
1577 : }
1578 :
1579 0 : static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1580 : {
1581 0 : struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1582 0 : struct btrfs_raid_bio *cur;
1583 0 : struct btrfs_raid_bio *last = NULL;
1584 :
1585 0 : list_sort(NULL, &plug->rbio_list, plug_cmp);
1586 :
1587 0 : while (!list_empty(&plug->rbio_list)) {
1588 0 : cur = list_entry(plug->rbio_list.next,
1589 : struct btrfs_raid_bio, plug_list);
1590 0 : list_del_init(&cur->plug_list);
1591 :
1592 0 : if (rbio_is_full(cur)) {
1593 : /* We have a full stripe, queue it down. */
1594 0 : start_async_work(cur, rmw_rbio_work);
1595 0 : continue;
1596 : }
1597 0 : if (last) {
1598 0 : if (rbio_can_merge(last, cur)) {
1599 0 : merge_rbio(last, cur);
1600 0 : free_raid_bio(cur);
1601 0 : continue;
1602 : }
1603 0 : start_async_work(last, rmw_rbio_work);
1604 : }
1605 : last = cur;
1606 : }
1607 0 : if (last)
1608 0 : start_async_work(last, rmw_rbio_work);
1609 0 : kfree(plug);
1610 0 : }
1611 :
1612 : /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1613 0 : static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1614 : {
1615 0 : const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1616 0 : const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1617 0 : const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1618 0 : const u32 orig_len = orig_bio->bi_iter.bi_size;
1619 0 : const u32 sectorsize = fs_info->sectorsize;
1620 0 : u64 cur_logical;
1621 :
1622 0 : ASSERT(orig_logical >= full_stripe_start &&
1623 : orig_logical + orig_len <= full_stripe_start +
1624 : rbio->nr_data * BTRFS_STRIPE_LEN);
1625 :
1626 0 : bio_list_add(&rbio->bio_list, orig_bio);
1627 0 : rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1628 :
1629 : /* Update the dbitmap. */
1630 0 : for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1631 0 : cur_logical += sectorsize) {
1632 0 : int bit = ((u32)(cur_logical - full_stripe_start) >>
1633 0 : fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1634 :
1635 0 : set_bit(bit, &rbio->dbitmap);
1636 : }
1637 0 : }
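/*
 * A worked example of the dbitmap math: with a 4K sector size and the 64K
 * stripe length (stripe_nsectors == 16), a bio starting 68K into the full
 * stripe sets bit (68K >> 12) % 16 == 17 % 16 == 1, i.e. the second
 * vertical stripe position, regardless of which data stripe it lands on.
 */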
1638 :
1639 : /*
1640 : * our main entry point for writes from the rest of the FS.
1641 : */
1642 0 : void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1643 : {
1644 0 : struct btrfs_fs_info *fs_info = bioc->fs_info;
1645 0 : struct btrfs_raid_bio *rbio;
1646 0 : struct btrfs_plug_cb *plug = NULL;
1647 0 : struct blk_plug_cb *cb;
1648 :
1649 0 : rbio = alloc_rbio(fs_info, bioc);
1650 0 : if (IS_ERR(rbio)) {
1651 0 : bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1652 0 : bio_endio(bio);
1653 0 : return;
1654 : }
1655 0 : rbio->operation = BTRFS_RBIO_WRITE;
1656 0 : rbio_add_bio(rbio, bio);
1657 :
1658 : /*
1659 : * Don't plug on full rbios, just get them out the door
1660 : * as quickly as we can
1661 : */
1662 0 : if (!rbio_is_full(rbio)) {
1663 0 : cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1664 0 : if (cb) {
1665 0 : plug = container_of(cb, struct btrfs_plug_cb, cb);
1666 0 : if (!plug->info) {
1667 0 : plug->info = fs_info;
1668 0 : INIT_LIST_HEAD(&plug->rbio_list);
1669 : }
1670 0 : list_add_tail(&rbio->plug_list, &plug->rbio_list);
1671 0 : return;
1672 : }
1673 : }
1674 :
1675 : /*
1676 : * Either we don't have any existing plug, or we're doing a full stripe,
1677 : * queue the rmw work now.
1678 : */
1679 0 : start_async_work(rbio, rmw_rbio_work);
1680 : }
1681 :
1682 0 : static int verify_one_sector(struct btrfs_raid_bio *rbio,
1683 : int stripe_nr, int sector_nr)
1684 : {
1685 0 : struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1686 0 : struct sector_ptr *sector;
1687 0 : u8 csum_buf[BTRFS_CSUM_SIZE];
1688 0 : u8 *csum_expected;
1689 0 : int ret;
1690 :
1691 0 : if (!rbio->csum_bitmap || !rbio->csum_buf)
1692 : return 0;
1693 :
1694 : /* No way to verify P/Q as they are not covered by data csum. */
1695 0 : if (stripe_nr >= rbio->nr_data)
1696 : return 0;
1697 : /*
1698 : * If we're rebuilding a read, we have to use pages from the
1699 : * bio list if possible.
1700 : */
1701 0 : if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1702 : rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
1703 0 : sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1704 : } else {
1705 0 : sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1706 : }
1707 :
1708 0 : ASSERT(sector->page);
1709 :
1710 0 : csum_expected = rbio->csum_buf +
1711 0 : (stripe_nr * rbio->stripe_nsectors + sector_nr) *
1712 0 : fs_info->csum_size;
1713 0 : ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
1714 : csum_buf, csum_expected);
1715 0 : return ret;
1716 : }
1717 :
1718 : /*
1719 : * Recover a vertical stripe specified by @sector_nr.
1720 : * @*pointers are the pre-allocated pointers by the caller, so we don't
1721 : * need to allocate/free the pointers again and again.
1722 : */
1723 0 : static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1724 : void **pointers, void **unmap_array)
1725 : {
1726 0 : struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1727 0 : struct sector_ptr *sector;
1728 0 : const u32 sectorsize = fs_info->sectorsize;
1729 0 : int found_errors;
1730 0 : int faila;
1731 0 : int failb;
1732 0 : int stripe_nr;
1733 0 : int ret = 0;
1734 :
1735 : /*
1736 : * Now we just use the bitmap to mark the horizontal stripes in
1737 : * which we have data when doing parity scrub.
1738 : */
1739 0 : if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1740 0 : !test_bit(sector_nr, &rbio->dbitmap))
1741 : return 0;
1742 :
1743 0 : found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1744 : &failb);
1745 : /*
1746 : * No errors in the vertical stripe, skip it. This can happen during
1747 : * recovery where only part of a stripe failed the csum check.
1748 : */
1749 0 : if (!found_errors)
1750 : return 0;
1751 :
1752 0 : if (found_errors > rbio->bioc->max_errors)
1753 : return -EIO;
1754 :
1755 : /*
1756 : * Set up our array of pointers with sectors from each stripe
1757 : *
1758 : * NOTE: store a duplicate array of pointers to preserve the
1759 : * pointer order.
1760 : */
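     : /*
     :  * The pstripe rebuild below rotates entries in @pointers, so
     :  * @unmap_array keeps the original mapping addresses for the
     :  * kunmap_local() calls in the cleanup loop.
     :  */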
1761 0 : for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1762 : /*
1763 : * If we're rebuilding a read, we have to use pages from the
1764 : * bio list if possible.
1765 : */
1766 0 : if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1767 : rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
1768 0 : sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1769 : } else {
1770 0 : sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1771 : }
1772 0 : ASSERT(sector->page);
1773 0 : pointers[stripe_nr] = kmap_local_page(sector->page) +
1774 0 : sector->pgoff;
1775 0 : unmap_array[stripe_nr] = pointers[stripe_nr];
1776 : }
1777 :
1778 : /* All raid6 handling here */
1779 0 : if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1780 : /* Single failure, rebuild from parity raid5 style */
1781 0 : if (failb < 0) {
1782 0 : if (faila == rbio->nr_data)
1783 : /*
1784 : * Just the P stripe has failed, without
1785 : * a bad data or Q stripe.
1786 : * We have nothing to do, just skip the
1787 : * recovery for this stripe.
1788 : */
1789 0 : goto cleanup;
1790 : /*
1791 : * a single failure in raid6 is rebuilt
1792 : * in the pstripe code below
1793 : */
1794 0 : goto pstripe;
1795 : }
1796 :
1797 : /*
1798 : * If the Q stripe has failed, do a P stripe reconstruction from
1799 : * the xors.
1800 : * If both the Q stripe and the P stripe have failed, we're
1801 : * here due to a crc mismatch and we can't give them the
1802 : * data they want.
1803 : */
1804 0 : if (failb == rbio->real_stripes - 1) {
1805 0 : if (faila == rbio->real_stripes - 2)
1806 : /*
1807 : * Only P and Q are corrupted.
1808 : * We only care about recovering the data stripes,
1809 : * so we can skip this vertical stripe.
1810 : */
1811 0 : goto cleanup;
1812 : /*
1813 : * Otherwise we have one bad data stripe and
1814 : * a good P stripe. raid5!
1815 : */
1816 0 : goto pstripe;
1817 : }
1818 :
1819 0 : if (failb == rbio->real_stripes - 2) {
1820 0 : raid6_datap_recov(rbio->real_stripes, sectorsize,
1821 : faila, pointers);
1822 : } else {
1823 0 : raid6_2data_recov(rbio->real_stripes, sectorsize,
1824 : faila, failb, pointers);
1825 : }
1826 : } else {
1827 : void *p;
1828 :
1829 : /* Rebuild from P stripe here (raid5 or raid6). */
1830 : ASSERT(failb == -1);
1831 0 : pstripe:
1832 : /* Copy parity block into failed block to start with */
1833 0 : memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1834 :
1835 : /* Rearrange the pointer array */
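     : /*
     :  * Move the failed block's buffer (now holding the parity copy) to
     :  * the last data slot, the destination that run_xor() xors into, so
     :  * xoring in the remaining data stripes reconstructs the missing
     :  * data in place.
     :  */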
1836 0 : p = pointers[faila];
1837 0 : for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1838 0 : stripe_nr++)
1839 0 : pointers[stripe_nr] = pointers[stripe_nr + 1];
1840 0 : pointers[rbio->nr_data - 1] = p;
1841 :
1842 : /* Xor in the rest */
1843 0 : run_xor(pointers, rbio->nr_data - 1, sectorsize);
1844 :
1845 : }
1846 :
1847 : /*
1848 : * No matter if this is an RMW or a recovery, we should have all the
1849 : * failed sectors in the vertical stripe repaired, thus they are now
1850 : * uptodate.
1851 : * This matters especially if we decide to cache the rbio, as we need
1852 : * at least all data sectors to be uptodate for that.
1853 : *
1854 : * If possible, also check if the repaired sector matches its data
1855 : * checksum.
1856 : */
1857 0 : if (faila >= 0) {
1858 0 : ret = verify_one_sector(rbio, faila, sector_nr);
1859 0 : if (ret < 0)
1860 0 : goto cleanup;
1861 :
1862 0 : sector = rbio_stripe_sector(rbio, faila, sector_nr);
1863 0 : sector->uptodate = 1;
1864 : }
1865 0 : if (failb >= 0) {
1866 0 : ret = verify_one_sector(rbio, failb, sector_nr);
1867 0 : if (ret < 0)
1868 0 : goto cleanup;
1869 :
1870 0 : sector = rbio_stripe_sector(rbio, failb, sector_nr);
1871 0 : sector->uptodate = 1;
1872 : }
1873 :
1874 0 : cleanup:
1875 0 : for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1876 : kunmap_local(unmap_array[stripe_nr]);
1877 : return ret;
1878 : }
1879 :
1880 0 : static int recover_sectors(struct btrfs_raid_bio *rbio)
1881 : {
1882 0 : void **pointers = NULL;
1883 0 : void **unmap_array = NULL;
1884 0 : int sectornr;
1885 0 : int ret = 0;
1886 :
1887 : /*
1888 : * The @pointers array stores the pointer for each sector.
1889 : *
1890 : * @unmap_array stores a copy of those pointers that does not get
1891 : * reordered during reconstruction, so that kunmap_local() works.
1892 : */
1893 0 : pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1894 0 : unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1895 0 : if (!pointers || !unmap_array) {
1896 0 : ret = -ENOMEM;
1897 0 : goto out;
1898 : }
1899 :
1900 0 : if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1901 : rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1902 0 : spin_lock(&rbio->bio_list_lock);
1903 0 : set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1904 0 : spin_unlock(&rbio->bio_list_lock);
1905 : }
1906 :
1907 0 : index_rbio_pages(rbio);
1908 :
1909 0 : for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1910 0 : ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
1911 0 : if (ret < 0)
1912 : break;
1913 : }
1914 :
1915 0 : out:
1916 0 : kfree(pointers);
1917 0 : kfree(unmap_array);
1918 0 : return ret;
1919 : }
1920 :
1921 0 : static void recover_rbio(struct btrfs_raid_bio *rbio)
1922 : {
1923 0 : struct bio_list bio_list = BIO_EMPTY_LIST;
1924 0 : int total_sector_nr;
1925 0 : int ret = 0;
1926 :
1927 : /*
1928 : * Whether we're recovering from a read failure or doing a degraded
1929 : * write, the caller should have set the error bitmap correctly.
1930 : */
1931 0 : ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
1932 :
1933 : /* For recovery, we need to read all sectors including P/Q. */
1934 0 : ret = alloc_rbio_pages(rbio);
1935 0 : if (ret < 0)
1936 0 : goto out;
1937 :
1938 0 : index_rbio_pages(rbio);
1939 :
1940 : /*
1941 : * Read everything that hasn't failed. However this time we will
1942 : * not trust any cached sector.
1943 : * A cached sector may contain stale data in a part that the higher
1944 : * layer is not reading, so it can not be trusted for reconstruction.
1945 : *
1946 : * So here we always re-read everything in the recovery path.
1947 : */
1948 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1949 0 : total_sector_nr++) {
1950 0 : int stripe = total_sector_nr / rbio->stripe_nsectors;
1951 0 : int sectornr = total_sector_nr % rbio->stripe_nsectors;
1952 0 : struct sector_ptr *sector;
1953 :
1954 : /*
1955 : * Skip any range which has an error. It can be a range which is
1956 : * marked as an error (e.g. for a csum mismatch), or it can be on a
1957 : * missing device.
1958 : */
1959 0 : if (!rbio->bioc->stripes[stripe].dev->bdev ||
1960 0 : test_bit(total_sector_nr, rbio->error_bitmap)) {
1961 : /*
1962 : * Also set the error bit for a missing device, which
1963 : * may not yet have its error bit set.
1964 : */
1965 0 : set_bit(total_sector_nr, rbio->error_bitmap);
1966 0 : continue;
1967 : }
1968 :
1969 0 : sector = rbio_stripe_sector(rbio, stripe, sectornr);
1970 0 : ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1971 : sectornr, REQ_OP_READ);
1972 0 : if (ret < 0) {
1973 0 : bio_list_put(&bio_list);
1974 0 : goto out;
1975 : }
1976 : }
1977 :
1978 0 : submit_read_wait_bio_list(rbio, &bio_list);
1979 0 : ret = recover_sectors(rbio);
1980 0 : out:
1981 0 : rbio_orig_end_io(rbio, errno_to_blk_status(ret));
1982 0 : }
1983 :
1984 0 : static void recover_rbio_work(struct work_struct *work)
1985 : {
1986 0 : struct btrfs_raid_bio *rbio;
1987 :
1988 0 : rbio = container_of(work, struct btrfs_raid_bio, work);
1989 0 : if (!lock_stripe_add(rbio))
1990 0 : recover_rbio(rbio);
1991 0 : }
1992 :
1993 0 : static void recover_rbio_work_locked(struct work_struct *work)
1994 : {
1995 0 : recover_rbio(container_of(work, struct btrfs_raid_bio, work));
1996 0 : }
1997 :
1998 0 : static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
1999 : {
2000 0 : bool found = false;
2001 0 : int sector_nr;
2002 :
2003 : /*
2004 : * This is for extra RAID6 recovery tries, thus the mirror number should
2005 : * be larger than 2.
2006 : * Mirror 1 means reading from the data stripes. Mirror 2 means rebuilding
2007 : * using RAID5 methods.
2008 : */
2009 0 : ASSERT(mirror_num > 2);
2010 0 : for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2011 0 : int found_errors;
2012 0 : int faila;
2013 0 : int failb;
2014 :
2015 0 : found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2016 : &faila, &failb);
2017 : /* This vertical stripe doesn't have errors. */
2018 0 : if (!found_errors)
2019 0 : continue;
2020 :
2021 : /*
2022 : * If we found errors, there should be only one error, marked
2023 : * by the previous set_rbio_range_error().
2024 : */
2025 0 : ASSERT(found_errors == 1);
2026 0 : found = true;
2027 :
2028 : /* Now select another stripe to mark as error. */
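     : /*
     :  * For mirror_num == 3 this is the P stripe (real_stripes - 2),
     :  * which forces reconstruction from the Q stripe; each higher
     :  * mirror number moves one stripe lower, and if the choice lands
     :  * on or before the already failed stripe we step one lower still.
     :  */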
2029 0 : failb = rbio->real_stripes - (mirror_num - 1);
2030 0 : if (failb <= faila)
2031 0 : failb--;
2032 :
2033 : /* Set the extra bit in error bitmap. */
2034 0 : if (failb >= 0)
2035 0 : set_bit(failb * rbio->stripe_nsectors + sector_nr,
2036 0 : rbio->error_bitmap);
2037 : }
2038 :
2039 : /* We should have found at least one vertical stripe with an error. */
2040 0 : ASSERT(found);
2041 0 : }
2042 :
2043 : /*
2044 : * the main entry point for reads from the higher layers. This
2045 : * is really only called when the normal read path had a failure,
2046 : * so we assume the bio they send down corresponds to a failed part
2047 : * of the drive.
2048 : */
2049 0 : void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2050 : int mirror_num)
2051 : {
2052 0 : struct btrfs_fs_info *fs_info = bioc->fs_info;
2053 0 : struct btrfs_raid_bio *rbio;
2054 :
2055 0 : rbio = alloc_rbio(fs_info, bioc);
2056 0 : if (IS_ERR(rbio)) {
2057 0 : bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2058 0 : bio_endio(bio);
2059 0 : return;
2060 : }
2061 :
2062 0 : rbio->operation = BTRFS_RBIO_READ_REBUILD;
2063 0 : rbio_add_bio(rbio, bio);
2064 :
2065 0 : set_rbio_range_error(rbio, bio);
2066 :
2067 : /*
2068 : * Loop retry:
2069 : * for 'mirror_num == 2', reconstruct from all other stripes.
2070 : * for 'mirror_num > 2', select a stripe to fail on every retry.
2071 : */
2072 0 : if (mirror_num > 2)
2073 0 : set_rbio_raid6_extra_error(rbio, mirror_num);
2074 :
2075 0 : start_async_work(rbio, recover_rbio_work);
2076 : }
2077 :
2078 0 : static void fill_data_csums(struct btrfs_raid_bio *rbio)
2079 : {
2080 0 : struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2081 0 : struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2082 : rbio->bioc->full_stripe_logical);
2083 0 : const u64 start = rbio->bioc->full_stripe_logical;
2084 0 : const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2085 0 : fs_info->sectorsize_bits;
2086 0 : int ret;
2087 :
2088 : /* The rbio should not have its csum buffer initialized. */
2089 0 : ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2090 :
2091 : /*
2092 : * Skip the csum search if:
2093 : *
2094 : * - The rbio doesn't belong to data block groups
2095 : * Then we are doing IO for tree blocks, no need to search csums.
2096 : *
2097 : * - The rbio belongs to mixed block groups
2098 : * This is to avoid a deadlock: we're already holding the full
2099 : * stripe lock, so if we trigger a metadata read that needs to do
2100 : * raid56 recovery, we will deadlock.
2101 : */
2102 0 : if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2103 : rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2104 : return;
2105 :
2106 0 : rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2107 0 : fs_info->csum_size, GFP_NOFS);
2108 0 : rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2109 : GFP_NOFS);
2110 0 : if (!rbio->csum_buf || !rbio->csum_bitmap) {
2111 0 : ret = -ENOMEM;
2112 0 : goto error;
2113 : }
2114 :
2115 0 : ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
2116 : rbio->csum_buf, rbio->csum_bitmap, false);
2117 0 : if (ret < 0)
2118 0 : goto error;
2119 0 : if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2120 0 : goto no_csum;
2121 : return;
2122 :
2123 0 : error:
2124 : /*
2125 : * We failed to allocate memory or to grab the csums, but it's not fatal,
2126 : * we can still continue. However it's better to warn users that RMW is
2127 : * no longer safe for this particular sub-stripe write.
2128 : */
2129 0 : btrfs_warn_rl(fs_info,
2130 : "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2131 : rbio->bioc->full_stripe_logical, ret);
2132 0 : no_csum:
2133 0 : kfree(rbio->csum_buf);
2134 0 : bitmap_free(rbio->csum_bitmap);
2135 0 : rbio->csum_buf = NULL;
2136 0 : rbio->csum_bitmap = NULL;
2137 : }
2138 :
2139 0 : static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2140 : {
2141 0 : struct bio_list bio_list = BIO_EMPTY_LIST;
2142 0 : int total_sector_nr;
2143 0 : int ret = 0;
2144 :
2145 : /*
2146 : * Fill the data csums we need for data verification. We need to fill
2147 : * the csum_bitmap/csum_buf first, as our endio function will try to
2148 : * verify the data sectors.
2149 : */
2150 0 : fill_data_csums(rbio);
2151 :
2152 : /*
2153 : * Build a list of bios to read all sectors (including data and P/Q).
2154 : *
2155 : * This is needed to support the later csum verification and recovery.
2156 : */
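     : /*
     :  * Reading every sector (not just the missing data) lets the read
     :  * end io verify the data checksums against csum_buf and mark any
     :  * mismatch in the error bitmap, which recover_sectors() below can
     :  * then repair from the remaining stripes.
     :  */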
2157 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2158 0 : total_sector_nr++) {
2159 0 : struct sector_ptr *sector;
2160 0 : int stripe = total_sector_nr / rbio->stripe_nsectors;
2161 0 : int sectornr = total_sector_nr % rbio->stripe_nsectors;
2162 :
2163 0 : sector = rbio_stripe_sector(rbio, stripe, sectornr);
2164 0 : ret = rbio_add_io_sector(rbio, &bio_list, sector,
2165 : stripe, sectornr, REQ_OP_READ);
2166 0 : if (ret) {
2167 0 : bio_list_put(&bio_list);
2168 0 : return ret;
2169 : }
2170 : }
2171 :
2172 : /*
2173 : * We may or may not have corrupted sectors (from a missing device
2174 : * or a csum mismatch), just let recover_sectors() handle them all.
2175 : */
2176 0 : submit_read_wait_bio_list(rbio, &bio_list);
2177 0 : return recover_sectors(rbio);
2178 : }
2179 :
2180 0 : static void raid_wait_write_end_io(struct bio *bio)
2181 : {
2182 0 : struct btrfs_raid_bio *rbio = bio->bi_private;
2183 0 : blk_status_t err = bio->bi_status;
2184 :
2185 0 : if (err)
2186 0 : rbio_update_error_bitmap(rbio, bio);
2187 0 : bio_put(bio);
2188 0 : if (atomic_dec_and_test(&rbio->stripes_pending))
2189 0 : wake_up(&rbio->io_wait);
2190 0 : }
2191 :
2192 0 : static void submit_write_bios(struct btrfs_raid_bio *rbio,
2193 : struct bio_list *bio_list)
2194 : {
2195 0 : struct bio *bio;
2196 :
2197 0 : atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2198 0 : while ((bio = bio_list_pop(bio_list))) {
2199 0 : bio->bi_end_io = raid_wait_write_end_io;
2200 :
2201 0 : if (trace_raid56_write_stripe_enabled()) {
2202 0 : struct raid56_bio_trace_info trace_info = { 0 };
2203 :
2204 0 : bio_get_trace_info(rbio, bio, &trace_info);
2205 0 : trace_raid56_write_stripe(rbio, bio, &trace_info);
2206 : }
2207 0 : submit_bio(bio);
2208 : }
2209 0 : }
2210 :
2211 : /*
2212 : * Determine if we need to read any sector from the disk.
2213 : * Should only be used in the RMW path, to skip a cached rbio.
2214 : */
2215 0 : static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2216 : {
2217 0 : int i;
2218 :
2219 0 : for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2220 0 : struct sector_ptr *sector = &rbio->stripe_sectors[i];
2221 :
2222 : /*
2223 : * We have a sector which doesn't have a page and isn't uptodate,
2224 : * thus this rbio can not be a cached one, as a cached rbio must
2225 : * have all its data sectors present and uptodate.
2226 : */
2227 0 : if (!sector->page || !sector->uptodate)
2228 : return true;
2229 : }
2230 : return false;
2231 : }
2232 :
2233 0 : static void rmw_rbio(struct btrfs_raid_bio *rbio)
2234 : {
2235 0 : struct bio_list bio_list;
2236 0 : int sectornr;
2237 0 : int ret = 0;
2238 :
2239 : /*
2240 : * Allocate the pages for parity first, as P/Q pages will always be
2241 : * needed for both full-stripe and sub-stripe writes.
2242 : */
2243 0 : ret = alloc_rbio_parity_pages(rbio);
2244 0 : if (ret < 0)
2245 0 : goto out;
2246 :
2247 : /*
2248 : * For a full stripe write, or when we have every data sector already
2249 : * cached, we can go to the write path immediately.
2250 : */
2251 0 : if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2252 : /*
2253 : * Now we're doing a sub-stripe write, we also need all the data
2254 : * stripes to do the full RMW.
2255 : */
2256 0 : ret = alloc_rbio_data_pages(rbio);
2257 0 : if (ret < 0)
2258 0 : goto out;
2259 :
2260 0 : index_rbio_pages(rbio);
2261 :
2262 0 : ret = rmw_read_wait_recover(rbio);
2263 0 : if (ret < 0)
2264 0 : goto out;
2265 : }
2266 :
2267 : /*
2268 : * At this stage we're not allowed to add any new bios to the
2269 : * bio list any more; anyone else that wants to change this stripe
2270 : * needs to do their own rmw.
2271 : */
2272 0 : spin_lock(&rbio->bio_list_lock);
2273 0 : set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2274 0 : spin_unlock(&rbio->bio_list_lock);
2275 :
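     : /*
     :  * Clear the error bitmap: any read time errors have been handled by
     :  * the recovery above, so the check after the writes below only
     :  * counts write failures.
     :  */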
2276 0 : bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2277 :
2278 0 : index_rbio_pages(rbio);
2279 :
2280 : /*
2281 : * We don't cache full rbios because we're assuming
2282 : * the higher layers are unlikely to use this area of
2283 : * the disk again soon. If they do use it again,
2284 : * hopefully they will send another full bio.
2285 : */
2286 0 : if (!rbio_is_full(rbio))
2287 0 : cache_rbio_pages(rbio);
2288 : else
2289 0 : clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2290 :
2291 0 : for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2292 0 : generate_pq_vertical(rbio, sectornr);
2293 :
2294 0 : bio_list_init(&bio_list);
2295 0 : ret = rmw_assemble_write_bios(rbio, &bio_list);
2296 0 : if (ret < 0)
2297 0 : goto out;
2298 :
2299 : /* We should have at least one bio assembled. */
2300 0 : ASSERT(bio_list_size(&bio_list));
2301 0 : submit_write_bios(rbio, &bio_list);
2302 0 : wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2303 :
2304 : /* We may have more errors than our tolerance during the writes. */
2305 0 : for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2306 0 : int found_errors;
2307 :
2308 0 : found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2309 0 : if (found_errors > rbio->bioc->max_errors) {
2310 : ret = -EIO;
2311 : break;
2312 : }
2313 : }
2314 0 : out:
2315 0 : rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2316 0 : }
2317 :
2318 0 : static void rmw_rbio_work(struct work_struct *work)
2319 : {
2320 0 : struct btrfs_raid_bio *rbio;
2321 :
2322 0 : rbio = container_of(work, struct btrfs_raid_bio, work);
2323 0 : if (lock_stripe_add(rbio) == 0)
2324 0 : rmw_rbio(rbio);
2325 0 : }
2326 :
2327 0 : static void rmw_rbio_work_locked(struct work_struct *work)
2328 : {
2329 0 : rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2330 0 : }
2331 :
2332 : /*
2333 : * The following code is used to scrub/replace the parity stripe.
2334 : *
2335 : * The caller must have already increased bio_counter for getting @bioc.
2336 : *
2337 : * Note: We need to make sure all the pages added into the scrub/replace
2338 : * raid bio are correct and do not change during the scrub/replace. That
2339 : * is, those pages hold only metadata or file data covered by checksums.
2340 : */
2341 :
2342 0 : struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2343 : struct btrfs_io_context *bioc,
2344 : struct btrfs_device *scrub_dev,
2345 : unsigned long *dbitmap, int stripe_nsectors)
2346 : {
2347 0 : struct btrfs_fs_info *fs_info = bioc->fs_info;
2348 0 : struct btrfs_raid_bio *rbio;
2349 0 : int i;
2350 :
2351 0 : rbio = alloc_rbio(fs_info, bioc);
2352 0 : if (IS_ERR(rbio))
2353 : return NULL;
2354 0 : bio_list_add(&rbio->bio_list, bio);
2355 : /*
2356 : * This is a special bio which is used to hold the completion handler
2357 : * and make the scrub rbio similar to the other types.
2358 : */
2359 0 : ASSERT(!bio->bi_iter.bi_size);
2360 0 : rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2361 :
2362 : /*
2363 : * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2364 : * to the end position, so this search can start from the first parity
2365 : * stripe.
2366 : */
2367 0 : for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2368 0 : if (bioc->stripes[i].dev == scrub_dev) {
2369 0 : rbio->scrubp = i;
2370 0 : break;
2371 : }
2372 : }
2373 0 : ASSERT(i < rbio->real_stripes);
2374 :
2375 0 : bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2376 0 : return rbio;
2377 : }
2378 :
2379 : /*
2380 : * We only scrub the parity for which we have correct data on the same
2381 : * horizontal stripe, so we don't need to allocate pages for all the stripes.
2382 : */
2383 0 : static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2384 : {
2385 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2386 0 : int total_sector_nr;
2387 :
2388 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2389 0 : total_sector_nr++) {
2390 0 : struct page *page;
2391 0 : int sectornr = total_sector_nr % rbio->stripe_nsectors;
2392 0 : int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2393 :
2394 0 : if (!test_bit(sectornr, &rbio->dbitmap))
2395 0 : continue;
2396 0 : if (rbio->stripe_pages[index])
2397 0 : continue;
2398 0 : page = alloc_page(GFP_NOFS);
2399 0 : if (!page)
2400 : return -ENOMEM;
2401 0 : rbio->stripe_pages[index] = page;
2402 : }
2403 0 : index_stripe_sectors(rbio);
2404 0 : return 0;
2405 : }
2406 :
2407 0 : static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2408 : {
2409 0 : struct btrfs_io_context *bioc = rbio->bioc;
2410 0 : const u32 sectorsize = bioc->fs_info->sectorsize;
2411 0 : void **pointers = rbio->finish_pointers;
2412 0 : unsigned long *pbitmap = &rbio->finish_pbitmap;
2413 0 : int nr_data = rbio->nr_data;
2414 0 : int stripe;
2415 0 : int sectornr;
2416 0 : bool has_qstripe;
2417 0 : struct sector_ptr p_sector = { 0 };
2418 0 : struct sector_ptr q_sector = { 0 };
2419 0 : struct bio_list bio_list;
2420 0 : int is_replace = 0;
2421 0 : int ret;
2422 :
2423 0 : bio_list_init(&bio_list);
2424 :
2425 0 : if (rbio->real_stripes - rbio->nr_data == 1)
2426 : has_qstripe = false;
2427 0 : else if (rbio->real_stripes - rbio->nr_data == 2)
2428 : has_qstripe = true;
2429 : else
2430 0 : BUG();
2431 :
2432 : /*
2433 : * If replace is running and our P/Q stripe is being replaced, then we
2434 : * need to duplicate the final write to the replace target.
2435 : */
2436 0 : if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2437 0 : is_replace = 1;
2438 0 : bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2439 : }
2440 :
2441 : /*
2442 : * The higher layers (the scrubber) are unlikely to
2443 : * use this area of the disk again soon, so don't cache
2444 : * it.
2445 : */
2446 0 : clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2447 :
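     : /*
     :  * p_sector and q_sector are scratch pages: the expected parity is
     :  * generated into them below and compared against the parity we
     :  * have on disk; sectors that already match are dropped from
     :  * dbitmap so they are not written back.
     :  */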
2448 0 : p_sector.page = alloc_page(GFP_NOFS);
2449 0 : if (!p_sector.page)
2450 : return -ENOMEM;
2451 0 : p_sector.pgoff = 0;
2452 0 : p_sector.uptodate = 1;
2453 :
2454 0 : if (has_qstripe) {
2455 : /* RAID6, allocate and map temp space for the Q stripe */
2456 0 : q_sector.page = alloc_page(GFP_NOFS);
2457 0 : if (!q_sector.page) {
2458 0 : __free_page(p_sector.page);
2459 0 : p_sector.page = NULL;
2460 0 : return -ENOMEM;
2461 : }
2462 0 : q_sector.pgoff = 0;
2463 0 : q_sector.uptodate = 1;
2464 0 : pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2465 : }
2466 :
2467 0 : bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2468 :
2469 : /* Map the parity stripe just once */
2470 0 : pointers[nr_data] = kmap_local_page(p_sector.page);
2471 :
2472 0 : for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2473 : struct sector_ptr *sector;
2474 : void *parity;
2475 :
2476 : /* first collect one page from each data stripe */
2477 0 : for (stripe = 0; stripe < nr_data; stripe++) {
2478 0 : sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2479 0 : pointers[stripe] = kmap_local_page(sector->page) +
2480 0 : sector->pgoff;
2481 : }
2482 :
2483 0 : if (has_qstripe) {
2484 : /* RAID6, call the library function to fill in our P/Q */
2485 0 : raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2486 : pointers);
2487 : } else {
2488 : /* raid5 */
2489 0 : memcpy(pointers[nr_data], pointers[0], sectorsize);
2490 0 : run_xor(pointers + 1, nr_data - 1, sectorsize);
2491 : }
2492 :
2493 : /* Check scrubbing parity and repair it */
2494 0 : sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2495 0 : parity = kmap_local_page(sector->page) + sector->pgoff;
2496 0 : if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2497 0 : memcpy(parity, pointers[rbio->scrubp], sectorsize);
2498 : else
2499 : /* The parity is correct, no need to write it back */
2500 0 : bitmap_clear(&rbio->dbitmap, sectornr, 1);
2501 0 : kunmap_local(parity);
2502 :
2503 0 : for (stripe = nr_data - 1; stripe >= 0; stripe--)
2504 : kunmap_local(pointers[stripe]);
2505 : }
2506 :
2507 0 : kunmap_local(pointers[nr_data]);
2508 0 : __free_page(p_sector.page);
2509 0 : p_sector.page = NULL;
2510 0 : if (q_sector.page) {
2511 0 : kunmap_local(pointers[rbio->real_stripes - 1]);
2512 0 : __free_page(q_sector.page);
2513 0 : q_sector.page = NULL;
2514 : }
2515 :
2516 : /*
2517 : * time to start writing. Make bios for everything from the
2518 : * higher layers (the bio_list in our rbio) and our p/q. Ignore
2519 : * everything else.
2520 : */
2521 0 : for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2522 0 : struct sector_ptr *sector;
2523 :
2524 0 : sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2525 0 : ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2526 : sectornr, REQ_OP_WRITE);
2527 0 : if (ret)
2528 0 : goto cleanup;
2529 : }
2530 :
2531 0 : if (!is_replace)
2532 0 : goto submit_write;
2533 :
2534 : /*
2535 : * Replace is running and our parity stripe needs to be duplicated to
2536 : * the target device. Check we have a valid source stripe number.
2537 : */
2538 : ASSERT(rbio->bioc->replace_stripe_src >= 0);
2539 0 : for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2540 0 : struct sector_ptr *sector;
2541 :
2542 0 : sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2543 0 : ret = rbio_add_io_sector(rbio, &bio_list, sector,
2544 0 : rbio->real_stripes,
2545 : sectornr, REQ_OP_WRITE);
2546 0 : if (ret)
2547 0 : goto cleanup;
2548 : }
2549 :
2550 0 : submit_write:
2551 0 : submit_write_bios(rbio, &bio_list);
2552 0 : return 0;
2553 :
2554 0 : cleanup:
2555 0 : bio_list_put(&bio_list);
2556 0 : return ret;
2557 : }
2558 :
2559 : static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2560 : {
2561 0 : if (stripe >= 0 && stripe < rbio->nr_data)
2562 0 : return 1;
2563 : return 0;
2564 : }
2565 :
2566 0 : static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2567 : {
2568 0 : void **pointers = NULL;
2569 0 : void **unmap_array = NULL;
2570 0 : int sector_nr;
2571 0 : int ret = 0;
2572 :
2573 : /*
2574 : * The @pointers array stores the pointer for each sector.
2575 : *
2576 : * @unmap_array stores a copy of those pointers that does not get
2577 : * reordered during reconstruction, so that kunmap_local() works.
2578 : */
2579 0 : pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2580 0 : unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2581 0 : if (!pointers || !unmap_array) {
2582 0 : ret = -ENOMEM;
2583 0 : goto out;
2584 : }
2585 :
2586 0 : for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2587 0 : int dfail = 0, failp = -1;
2588 0 : int faila;
2589 0 : int failb;
2590 0 : int found_errors;
2591 :
2592 0 : found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2593 : &faila, &failb);
2594 0 : if (found_errors > rbio->bioc->max_errors) {
2595 0 : ret = -EIO;
2596 0 : goto out;
2597 : }
2598 0 : if (found_errors == 0)
2599 0 : continue;
2600 :
2601 : /* We should have at least one error here. */
2602 0 : ASSERT(faila >= 0 || failb >= 0);
2603 :
2604 0 : if (is_data_stripe(rbio, faila))
2605 : dfail++;
2606 0 : else if (is_parity_stripe(faila))
2607 : failp = faila;
2608 :
2609 0 : if (is_data_stripe(rbio, failb))
2610 0 : dfail++;
2611 0 : else if (is_parity_stripe(failb))
2612 : failp = failb;
2613 : /*
2614 : * Because we can not use the parity being scrubbed to repair the
2615 : * data, our repair capability is reduced. (In the
2616 : * case of RAID5, we can not repair anything.)
2617 : */
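     : /*
     :  * E.g. on RAID6 (max_errors == 2) at most one failed data stripe
     :  * can be tolerated while scrubbing a parity, and on RAID5
     :  * (max_errors == 1) any failed data stripe is fatal here.
     :  */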
2618 0 : if (dfail > rbio->bioc->max_errors - 1) {
2619 0 : ret = -EIO;
2620 0 : goto out;
2621 : }
2622 : /*
2623 : * If all the data is good and only the parity is corrupted, just
2624 : * repair the parity, there is no need to recover data stripes.
2625 : */
2626 0 : if (dfail == 0)
2627 0 : continue;
2628 :
2629 : /*
2630 : * Here we have one corrupted data stripe and one corrupted
2631 : * parity on RAID6. If the corrupted parity is the one being
2632 : * scrubbed, we can luckily use the other parity to repair the
2633 : * data; otherwise we can not repair the data stripe.
2634 : */
2635 0 : if (failp != rbio->scrubp) {
2636 0 : ret = -EIO;
2637 0 : goto out;
2638 : }
2639 :
2640 : ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2641 : if (ret < 0)
2642 : goto out;
2643 : }
2644 0 : out:
2645 0 : kfree(pointers);
2646 0 : kfree(unmap_array);
2647 0 : return ret;
2648 : }
2649 :
2650 0 : static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2651 : {
2652 0 : struct bio_list bio_list = BIO_EMPTY_LIST;
2653 0 : int total_sector_nr;
2654 0 : int ret = 0;
2655 :
2656 : /* Build a list of bios to read all the missing parts. */
2657 0 : for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2658 0 : total_sector_nr++) {
2659 0 : int sectornr = total_sector_nr % rbio->stripe_nsectors;
2660 0 : int stripe = total_sector_nr / rbio->stripe_nsectors;
2661 0 : struct sector_ptr *sector;
2662 :
2663 : /* No data in the vertical stripe, no need to read. */
2664 0 : if (!test_bit(sectornr, &rbio->dbitmap))
2665 0 : continue;
2666 :
2667 : /*
2668 : * We want to find all the sectors missing from the rbio and
2669 : * read them from the disk. If sector_in_rbio() finds a sector
2670 : * in the bio list we don't need to read it off the stripe.
2671 : */
2672 0 : sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2673 0 : if (sector)
2674 0 : continue;
2675 :
2676 0 : sector = rbio_stripe_sector(rbio, stripe, sectornr);
2677 : /*
2678 : * The bio cache may have handed us an uptodate sector. If so,
2679 : * use it.
2680 : */
2681 0 : if (sector->uptodate)
2682 0 : continue;
2683 :
2684 0 : ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2685 : sectornr, REQ_OP_READ);
2686 0 : if (ret) {
2687 0 : bio_list_put(&bio_list);
2688 0 : return ret;
2689 : }
2690 : }
2691 :
2692 0 : submit_read_wait_bio_list(rbio, &bio_list);
2693 0 : return 0;
2694 : }
2695 :
2696 0 : static void scrub_rbio(struct btrfs_raid_bio *rbio)
2697 : {
2698 0 : int sector_nr;
2699 0 : int ret;
2700 :
2701 0 : ret = alloc_rbio_essential_pages(rbio);
2702 0 : if (ret)
2703 0 : goto out;
2704 :
2705 0 : bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2706 :
2707 0 : ret = scrub_assemble_read_bios(rbio);
2708 0 : if (ret < 0)
2709 0 : goto out;
2710 :
2711 : /* We may have some failures, recover the failed sectors first. */
2712 0 : ret = recover_scrub_rbio(rbio);
2713 0 : if (ret < 0)
2714 0 : goto out;
2715 :
2716 : /*
2717 : * We have every sector properly prepared. We can now finish the scrub
2718 : * and write back the good content.
2719 : */
2720 0 : ret = finish_parity_scrub(rbio);
2721 0 : wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2722 0 : for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2723 0 : int found_errors;
2724 :
2725 0 : found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2726 0 : if (found_errors > rbio->bioc->max_errors) {
2727 : ret = -EIO;
2728 : break;
2729 : }
2730 : }
2731 0 : out:
2732 0 : rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2733 0 : }
2734 :
2735 0 : static void scrub_rbio_work_locked(struct work_struct *work)
2736 : {
2737 0 : scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2738 0 : }
2739 :
2740 0 : void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2741 : {
2742 0 : if (!lock_stripe_add(rbio))
2743 0 : start_async_work(rbio, scrub_rbio_work_locked);
2744 0 : }
2745 :
2746 : /*
2747 : * This is for scrub call sites where we already have correct data contents.
2748 : * This allows us to avoid reading data stripes again.
2749 : *
2750 : * Unfortunately here we have to do a page copy, rather than reusing the pages.
2751 : * This is due to the fact that the rbio has its own page management for its cache.
2752 : */
2753 0 : void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2754 : struct page **data_pages, u64 data_logical)
2755 : {
2756 0 : const u64 offset_in_full_stripe = data_logical -
2757 0 : rbio->bioc->full_stripe_logical;
2758 0 : const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2759 0 : const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2760 0 : const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2761 0 : int ret;
2762 :
2763 : /*
2764 : * If we hit ENOMEM temporarily here, but the allocation later at
2765 : * raid56_parity_submit_scrub_rbio() time succeeds, we just do
2766 : * the extra read, not a big deal.
2767 : *
2768 : * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2769 : * the bio will get a proper error number set.
2770 : */
2771 0 : ret = alloc_rbio_data_pages(rbio);
2772 0 : if (ret < 0)
2773 : return;
2774 :
2775 : /* data_logical must be at stripe boundary and inside the full stripe. */
2776 : ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2777 : ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2778 :
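     : /*
     :  * Copy one BTRFS_STRIPE_LEN worth of data pages into the rbio's
     :  * stripe pages starting at page_index, and mark those sectors
     :  * uptodate so later read paths (e.g. scrub_assemble_read_bios())
     :  * can skip them.
     :  */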
2779 0 : for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2780 0 : struct page *dst = rbio->stripe_pages[page_nr + page_index];
2781 0 : struct page *src = data_pages[page_nr];
2782 :
2783 0 : memcpy_page(dst, 0, src, 0, PAGE_SIZE);
2784 0 : for (int sector_nr = sectors_per_page * page_index;
2785 0 : sector_nr < sectors_per_page * (page_index + 1);
2786 0 : sector_nr++)
2787 0 : rbio->stripe_sectors[sector_nr].uptodate = true;
2788 : }
2789 : }