Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2007,2008 Oracle. All rights reserved.
4 : */
5 :
6 : #include <linux/sched.h>
7 : #include <linux/slab.h>
8 : #include <linux/rbtree.h>
9 : #include <linux/mm.h>
10 : #include <linux/error-injection.h>
11 : #include "messages.h"
12 : #include "ctree.h"
13 : #include "disk-io.h"
14 : #include "transaction.h"
15 : #include "print-tree.h"
16 : #include "locking.h"
17 : #include "volumes.h"
18 : #include "qgroup.h"
19 : #include "tree-mod-log.h"
20 : #include "tree-checker.h"
21 : #include "fs.h"
22 : #include "accessors.h"
23 : #include "extent-tree.h"
24 : #include "relocation.h"
25 : #include "file-item.h"
26 :
27 : static struct kmem_cache *btrfs_path_cachep;
28 :
29 : static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30 : *root, struct btrfs_path *path, int level);
31 : static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32 : const struct btrfs_key *ins_key, struct btrfs_path *path,
33 : int data_size, int extend);
34 : static int push_node_left(struct btrfs_trans_handle *trans,
35 : struct extent_buffer *dst,
36 : struct extent_buffer *src, int empty);
37 : static int balance_node_right(struct btrfs_trans_handle *trans,
38 : struct extent_buffer *dst_buf,
39 : struct extent_buffer *src_buf);
40 :
41 : static const struct btrfs_csums {
42 : u16 size;
43 : const char name[10];
44 : const char driver[12];
45 : } btrfs_csums[] = {
46 : [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
47 : [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
48 : [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
49 : [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
50 : .driver = "blake2b-256" },
51 : };
52 :
53 : /*
54 : * The leaf data grows from end-to-front in the node. this returns the address
55 : * of the start of the last item, which is the stop of the leaf data stack.
56 : */
57 136314326 : static unsigned int leaf_data_end(const struct extent_buffer *leaf)
58 : {
59 136314326 : u32 nr = btrfs_header_nritems(leaf);
60 :
61 136314326 : if (nr == 0)
62 11252 : return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
63 136303074 : return btrfs_item_offset(leaf, nr - 1);
64 : }
65 :
66 : /*
67 : * Move data in a @leaf (using memmove, safe for overlapping ranges).
68 : *
69 : * @leaf: leaf that we're doing a memmove on
70 : * @dst_offset: item data offset we're moving to
71 : * @src_offset: item data offset were' moving from
72 : * @len: length of the data we're moving
73 : *
74 : * Wrapper around memmove_extent_buffer() that takes into account the header on
75 : * the leaf. The btrfs_item offset's start directly after the header, so we
76 : * have to adjust any offsets to account for the header in the leaf. This
77 : * handles that math to simplify the callers.
78 : */
79 : static inline void memmove_leaf_data(const struct extent_buffer *leaf,
80 : unsigned long dst_offset,
81 : unsigned long src_offset,
82 : unsigned long len)
83 : {
84 101290967 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
85 : btrfs_item_nr_offset(leaf, 0) + src_offset, len);
86 15176899 : }
87 :
88 : /*
89 : * Copy item data from @src into @dst at the given @offset.
90 : *
91 : * @dst: destination leaf that we're copying into
92 : * @src: source leaf that we're copying from
93 : * @dst_offset: item data offset we're copying to
94 : * @src_offset: item data offset were' copying from
95 : * @len: length of the data we're copying
96 : *
97 : * Wrapper around copy_extent_buffer() that takes into account the header on
98 : * the leaf. The btrfs_item offset's start directly after the header, so we
99 : * have to adjust any offsets to account for the header in the leaf. This
100 : * handles that math to simplify the callers.
101 : */
102 : static inline void copy_leaf_data(const struct extent_buffer *dst,
103 : const struct extent_buffer *src,
104 : unsigned long dst_offset,
105 : unsigned long src_offset, unsigned long len)
106 : {
107 2741221 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
108 : btrfs_item_nr_offset(src, 0) + src_offset, len);
109 : }
110 :
111 : /*
112 : * Move items in a @leaf (using memmove).
113 : *
114 : * @dst: destination leaf for the items
115 : * @dst_item: the item nr we're copying into
116 : * @src_item: the item nr we're copying from
117 : * @nr_items: the number of items to copy
118 : *
119 : * Wrapper around memmove_extent_buffer() that does the math to get the
120 : * appropriate offsets into the leaf from the item numbers.
121 : */
122 61259984 : static inline void memmove_leaf_items(const struct extent_buffer *leaf,
123 : int dst_item, int src_item, int nr_items)
124 : {
125 61259984 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
126 : btrfs_item_nr_offset(leaf, src_item),
127 : nr_items * sizeof(struct btrfs_item));
128 952186 : }
129 :
130 : /*
131 : * Copy items from @src into @dst at the given @offset.
132 : *
133 : * @dst: destination leaf for the items
134 : * @src: source leaf for the items
135 : * @dst_item: the item nr we're copying into
136 : * @src_item: the item nr we're copying from
137 : * @nr_items: the number of items to copy
138 : *
139 : * Wrapper around copy_extent_buffer() that does the math to get the
140 : * appropriate offsets into the leaf from the item numbers.
141 : */
142 : static inline void copy_leaf_items(const struct extent_buffer *dst,
143 : const struct extent_buffer *src,
144 : int dst_item, int src_item, int nr_items)
145 : {
146 1040815 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
147 : btrfs_item_nr_offset(src, src_item),
148 : nr_items * sizeof(struct btrfs_item));
149 : }
150 :
151 : /* This exists for btrfs-progs usages. */
152 0 : u16 btrfs_csum_type_size(u16 type)
153 : {
154 3254 : return btrfs_csums[type].size;
155 : }
156 :
157 3254 : int btrfs_super_csum_size(const struct btrfs_super_block *s)
158 : {
159 3254 : u16 t = btrfs_super_csum_type(s);
160 : /*
161 : * csum type is validated at mount time
162 : */
163 3254 : return btrfs_csum_type_size(t);
164 : }
165 :
166 3242 : const char *btrfs_super_csum_name(u16 csum_type)
167 : {
168 : /* csum type is validated at mount time */
169 3242 : return btrfs_csums[csum_type].name;
170 : }
171 :
172 : /*
173 : * Return driver name if defined, otherwise the name that's also a valid driver
174 : * name
175 : */
176 3242 : const char *btrfs_super_csum_driver(u16 csum_type)
177 : {
178 : /* csum type is validated at mount time */
179 3242 : return btrfs_csums[csum_type].driver[0] ?
180 3242 : btrfs_csums[csum_type].driver :
181 : btrfs_csums[csum_type].name;
182 : }
183 :
184 0 : size_t __attribute_const__ btrfs_get_num_csums(void)
185 : {
186 0 : return ARRAY_SIZE(btrfs_csums);
187 : }
188 :
189 198913144 : struct btrfs_path *btrfs_alloc_path(void)
190 : {
191 198913144 : might_sleep();
192 :
193 198907827 : return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
194 : }
195 :
196 : /* this also releases the path */
197 296458616 : void btrfs_free_path(struct btrfs_path *p)
198 : {
199 296458616 : if (!p)
200 : return;
201 198939485 : btrfs_release_path(p);
202 198951417 : kmem_cache_free(btrfs_path_cachep, p);
203 : }
204 :
205 : /*
206 : * path release drops references on the extent buffers in the path
207 : * and it drops any locks held by this path
208 : *
209 : * It is safe to call this on paths that no locks or extent buffers held.
210 : */
211 575995349 : noinline void btrfs_release_path(struct btrfs_path *p)
212 : {
213 575995349 : int i;
214 :
215 5181869846 : for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
216 4605943683 : p->slots[i] = 0;
217 4605943683 : if (!p->nodes[i])
218 3673997998 : continue;
219 931945685 : if (p->locks[i]) {
220 415623920 : btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
221 415594203 : p->locks[i] = 0;
222 : }
223 931915968 : free_extent_buffer(p->nodes[i]);
224 931876499 : p->nodes[i] = NULL;
225 : }
226 575926163 : }
227 :
228 : /*
229 : * We want the transaction abort to print stack trace only for errors where the
230 : * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
231 : * caused by external factors.
232 : */
233 17 : bool __cold abort_should_print_stack(int errno)
234 : {
235 17 : switch (errno) {
236 : case -EIO:
237 : case -EROFS:
238 : case -ENOMEM:
239 : return false;
240 : }
241 0 : return true;
242 : }
243 :
244 : /*
245 : * safely gets a reference on the root node of a tree. A lock
246 : * is not taken, so a concurrent writer may put a different node
247 : * at the root of the tree. See btrfs_lock_root_node for the
248 : * looping required.
249 : *
250 : * The extent buffer returned by this has a reference taken, so
251 : * it won't disappear. It may stop being the root of the tree
252 : * at any time because there are no locks held.
253 : */
254 490240797 : struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
255 : {
256 490240797 : struct extent_buffer *eb;
257 :
258 490240797 : while (1) {
259 490240797 : rcu_read_lock();
260 490277442 : eb = rcu_dereference(root->node);
261 :
262 : /*
263 : * RCU really hurts here, we could free up the root node because
264 : * it was COWed but we may not get the new root node yet so do
265 : * the inc_not_zero dance and if it doesn't work then
266 : * synchronize_rcu and try again.
267 : */
268 980773138 : if (atomic_inc_not_zero(&eb->refs)) {
269 490495696 : rcu_read_unlock();
270 490473583 : break;
271 : }
272 0 : rcu_read_unlock();
273 0 : synchronize_rcu();
274 : }
275 490473583 : return eb;
276 : }
277 :
278 : /*
279 : * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
280 : * just get put onto a simple dirty list. Transaction walks this list to make
281 : * sure they get properly updated on disk.
282 : */
283 1314629 : static void add_root_to_dirty_list(struct btrfs_root *root)
284 : {
285 1314629 : struct btrfs_fs_info *fs_info = root->fs_info;
286 :
287 1314629 : if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
288 0 : !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
289 : return;
290 :
291 449571 : spin_lock(&fs_info->trans_lock);
292 449572 : if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
293 : /* Want the extent tree to be the last on the list */
294 449572 : if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
295 203247 : list_move_tail(&root->dirty_list,
296 : &fs_info->dirty_cowonly_roots);
297 : else
298 246325 : list_move(&root->dirty_list,
299 : &fs_info->dirty_cowonly_roots);
300 : }
301 449572 : spin_unlock(&fs_info->trans_lock);
302 : }
303 :
304 : /*
305 : * used by snapshot creation to make a copy of a root for a tree with
306 : * a given objectid. The buffer with the new root node is returned in
307 : * cow_ret, and this func returns zero on success or a negative error code.
308 : */
309 2559 : int btrfs_copy_root(struct btrfs_trans_handle *trans,
310 : struct btrfs_root *root,
311 : struct extent_buffer *buf,
312 : struct extent_buffer **cow_ret, u64 new_root_objectid)
313 : {
314 2559 : struct btrfs_fs_info *fs_info = root->fs_info;
315 2559 : struct extent_buffer *cow;
316 2559 : int ret = 0;
317 2559 : int level;
318 2559 : struct btrfs_disk_key disk_key;
319 :
320 5118 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
321 : trans->transid != fs_info->running_transaction->transid);
322 5118 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
323 : trans->transid != root->last_trans);
324 :
325 2559 : level = btrfs_header_level(buf);
326 2559 : if (level == 0)
327 831 : btrfs_item_key(buf, &disk_key, 0);
328 : else
329 1728 : btrfs_node_key(buf, &disk_key, 0);
330 :
331 2559 : cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
332 : &disk_key, level, buf->start, 0,
333 : BTRFS_NESTING_NEW_ROOT);
334 2559 : if (IS_ERR(cow))
335 0 : return PTR_ERR(cow);
336 :
337 2559 : copy_extent_buffer_full(cow, buf);
338 2559 : btrfs_set_header_bytenr(cow, cow->start);
339 2559 : btrfs_set_header_generation(cow, trans->transid);
340 2559 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
341 2559 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
342 : BTRFS_HEADER_FLAG_RELOC);
343 2559 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
344 1534 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
345 : else
346 1025 : btrfs_set_header_owner(cow, new_root_objectid);
347 :
348 2559 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
349 :
350 2559 : WARN_ON(btrfs_header_generation(buf) > trans->transid);
351 2559 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
352 1534 : ret = btrfs_inc_ref(trans, root, cow, 1);
353 : else
354 1025 : ret = btrfs_inc_ref(trans, root, cow, 0);
355 2559 : if (ret) {
356 0 : btrfs_tree_unlock(cow);
357 0 : free_extent_buffer(cow);
358 0 : btrfs_abort_transaction(trans, ret);
359 0 : return ret;
360 : }
361 :
362 2559 : btrfs_mark_buffer_dirty(cow);
363 2559 : *cow_ret = cow;
364 2559 : return 0;
365 : }
366 :
367 : /*
368 : * check if the tree block can be shared by multiple trees
369 : */
370 8724595 : int btrfs_block_can_be_shared(struct btrfs_root *root,
371 : struct extent_buffer *buf)
372 : {
373 : /*
374 : * Tree blocks not in shareable trees and tree roots are never shared.
375 : * If a block was allocated after the last snapshot and the block was
376 : * not allocated by tree relocation, we know the block is not shared.
377 : */
378 17449190 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
379 4804430 : buf != root->node && buf != root->commit_root &&
380 : (btrfs_header_generation(buf) <=
381 2683390 : btrfs_root_last_snapshot(&root->root_item) ||
382 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
383 1953174 : return 1;
384 :
385 : return 0;
386 : }
387 :
388 8629515 : static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
389 : struct btrfs_root *root,
390 : struct extent_buffer *buf,
391 : struct extent_buffer *cow,
392 : int *last_ref)
393 : {
394 8629515 : struct btrfs_fs_info *fs_info = root->fs_info;
395 8629515 : u64 refs;
396 8629515 : u64 owner;
397 8629515 : u64 flags;
398 8629515 : u64 new_flags = 0;
399 8629515 : int ret;
400 :
401 : /*
402 : * Backrefs update rules:
403 : *
404 : * Always use full backrefs for extent pointers in tree block
405 : * allocated by tree relocation.
406 : *
407 : * If a shared tree block is no longer referenced by its owner
408 : * tree (btrfs_header_owner(buf) == root->root_key.objectid),
409 : * use full backrefs for extent pointers in tree block.
410 : *
411 : * If a tree block is been relocating
412 : * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
413 : * use full backrefs for extent pointers in tree block.
414 : * The reason for this is some operations (such as drop tree)
415 : * are only allowed for blocks use full backrefs.
416 : */
417 :
418 8629515 : if (btrfs_block_can_be_shared(root, buf)) {
419 1953046 : ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
420 : btrfs_header_level(buf), 1,
421 : &refs, &flags);
422 1953047 : if (ret)
423 : return ret;
424 1953047 : if (unlikely(refs == 0)) {
425 0 : btrfs_crit(fs_info,
426 : "found 0 references for tree block at bytenr %llu level %d root %llu",
427 : buf->start, btrfs_header_level(buf),
428 : btrfs_root_id(root));
429 0 : ret = -EUCLEAN;
430 0 : btrfs_abort_transaction(trans, ret);
431 0 : return ret;
432 : }
433 : } else {
434 6676389 : refs = 1;
435 6676389 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
436 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
437 691 : flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
438 : else
439 6675698 : flags = 0;
440 : }
441 :
442 8629436 : owner = btrfs_header_owner(buf);
443 8629436 : BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
444 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
445 :
446 8629436 : if (refs > 1) {
447 1951540 : if ((owner == root->root_key.objectid ||
448 1950710 : root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
449 1950710 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
450 69062 : ret = btrfs_inc_ref(trans, root, buf, 1);
451 69062 : if (ret)
452 : return ret;
453 :
454 69062 : if (root->root_key.objectid ==
455 : BTRFS_TREE_RELOC_OBJECTID) {
456 32529 : ret = btrfs_dec_ref(trans, root, buf, 0);
457 32529 : if (ret)
458 : return ret;
459 32529 : ret = btrfs_inc_ref(trans, root, cow, 1);
460 32529 : if (ret)
461 : return ret;
462 : }
463 69062 : new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
464 : } else {
465 :
466 1882478 : if (root->root_key.objectid ==
467 : BTRFS_TREE_RELOC_OBJECTID)
468 1881518 : ret = btrfs_inc_ref(trans, root, cow, 1);
469 : else
470 960 : ret = btrfs_inc_ref(trans, root, cow, 0);
471 1882478 : if (ret)
472 0 : return ret;
473 : }
474 69062 : if (new_flags != 0) {
475 69062 : ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
476 69062 : if (ret)
477 0 : return ret;
478 : }
479 : } else {
480 6677896 : if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
481 1167 : if (root->root_key.objectid ==
482 : BTRFS_TREE_RELOC_OBJECTID)
483 1009 : ret = btrfs_inc_ref(trans, root, cow, 1);
484 : else
485 158 : ret = btrfs_inc_ref(trans, root, cow, 0);
486 1167 : if (ret)
487 : return ret;
488 1167 : ret = btrfs_dec_ref(trans, root, buf, 1);
489 1167 : if (ret)
490 : return ret;
491 : }
492 6677896 : btrfs_clear_buffer_dirty(trans, buf);
493 6677889 : *last_ref = 1;
494 : }
495 : return 0;
496 : }
497 :
498 : /*
499 : * does the dirty work in cow of a single block. The parent block (if
500 : * supplied) is updated to point to the new cow copy. The new buffer is marked
501 : * dirty and returned locked. If you modify the block it needs to be marked
502 : * dirty again.
503 : *
504 : * search_start -- an allocation hint for the new block
505 : *
506 : * empty_size -- a hint that you plan on doing more cow. This is the size in
507 : * bytes the allocator should try to find free next to the block it returns.
508 : * This is just a hint and may be ignored by the allocator.
509 : */
510 8630899 : static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
511 : struct btrfs_root *root,
512 : struct extent_buffer *buf,
513 : struct extent_buffer *parent, int parent_slot,
514 : struct extent_buffer **cow_ret,
515 : u64 search_start, u64 empty_size,
516 : enum btrfs_lock_nesting nest)
517 : {
518 8630899 : struct btrfs_fs_info *fs_info = root->fs_info;
519 8630899 : struct btrfs_disk_key disk_key;
520 8630899 : struct extent_buffer *cow;
521 8630899 : int level, ret;
522 8630899 : int last_ref = 0;
523 8630899 : int unlock_orig = 0;
524 8630899 : u64 parent_start = 0;
525 :
526 8630899 : if (*cow_ret == buf)
527 8630920 : unlock_orig = 1;
528 :
529 8630899 : btrfs_assert_tree_write_locked(buf);
530 :
531 13435899 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
532 : trans->transid != fs_info->running_transaction->transid);
533 13435902 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
534 : trans->transid != root->last_trans);
535 :
536 8630899 : level = btrfs_header_level(buf);
537 :
538 8630899 : if (level == 0)
539 7752286 : btrfs_item_key(buf, &disk_key, 0);
540 : else
541 878613 : btrfs_node_key(buf, &disk_key, 0);
542 :
543 8630897 : if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
544 1914353 : parent_start = parent->start;
545 :
546 8630897 : cow = btrfs_alloc_tree_block(trans, root, parent_start,
547 : root->root_key.objectid, &disk_key, level,
548 : search_start, empty_size, nest);
549 8631150 : if (IS_ERR(cow))
550 1491 : return PTR_ERR(cow);
551 :
552 : /* cow is set to blocking by btrfs_init_new_buffer */
553 :
554 8629659 : copy_extent_buffer_full(cow, buf);
555 8629585 : btrfs_set_header_bytenr(cow, cow->start);
556 8629585 : btrfs_set_header_generation(cow, trans->transid);
557 8629585 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
558 8629497 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
559 : BTRFS_HEADER_FLAG_RELOC);
560 8629551 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
561 1915056 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
562 : else
563 6714495 : btrfs_set_header_owner(cow, root->root_key.objectid);
564 :
565 8629551 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
566 :
567 8629566 : ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
568 8629432 : if (ret) {
569 0 : btrfs_tree_unlock(cow);
570 0 : free_extent_buffer(cow);
571 0 : btrfs_abort_transaction(trans, ret);
572 0 : return ret;
573 : }
574 :
575 17258864 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
576 4803459 : ret = btrfs_reloc_cow_block(trans, root, buf, cow);
577 4803333 : if (ret) {
578 0 : btrfs_tree_unlock(cow);
579 0 : free_extent_buffer(cow);
580 0 : btrfs_abort_transaction(trans, ret);
581 0 : return ret;
582 : }
583 : }
584 :
585 8629306 : if (buf == root->node) {
586 1311493 : WARN_ON(parent && parent != buf);
587 1311493 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
588 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
589 703 : parent_start = buf->start;
590 :
591 1311493 : ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
592 1311471 : if (ret < 0) {
593 0 : btrfs_tree_unlock(cow);
594 0 : free_extent_buffer(cow);
595 0 : btrfs_abort_transaction(trans, ret);
596 0 : return ret;
597 : }
598 1311471 : atomic_inc(&cow->refs);
599 1311511 : rcu_assign_pointer(root->node, cow);
600 :
601 1311511 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
602 : parent_start, last_ref);
603 1311516 : free_extent_buffer(buf);
604 1311514 : add_root_to_dirty_list(root);
605 : } else {
606 7317813 : WARN_ON(trans->transid != btrfs_header_generation(parent));
607 7317813 : ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
608 : BTRFS_MOD_LOG_KEY_REPLACE);
609 7317625 : if (ret) {
610 0 : btrfs_tree_unlock(cow);
611 0 : free_extent_buffer(cow);
612 0 : btrfs_abort_transaction(trans, ret);
613 0 : return ret;
614 : }
615 7317625 : btrfs_set_node_blockptr(parent, parent_slot,
616 : cow->start);
617 7317582 : btrfs_set_node_ptr_generation(parent, parent_slot,
618 : trans->transid);
619 7317685 : btrfs_mark_buffer_dirty(parent);
620 7317920 : if (last_ref) {
621 5366380 : ret = btrfs_tree_mod_log_free_eb(buf);
622 5365801 : if (ret) {
623 0 : btrfs_tree_unlock(cow);
624 0 : free_extent_buffer(cow);
625 0 : btrfs_abort_transaction(trans, ret);
626 0 : return ret;
627 : }
628 : }
629 7317341 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
630 : parent_start, last_ref);
631 : }
632 8629686 : if (unlock_orig)
633 8629684 : btrfs_tree_unlock(buf);
634 8629686 : free_extent_buffer_stale(buf);
635 8629304 : btrfs_mark_buffer_dirty(cow);
636 8629700 : *cow_ret = cow;
637 8629700 : return 0;
638 : }
639 :
640 486197484 : static inline int should_cow_block(struct btrfs_trans_handle *trans,
641 : struct btrfs_root *root,
642 : struct extent_buffer *buf)
643 : {
644 486197484 : if (btrfs_is_testing(root->fs_info))
645 : return 0;
646 :
647 : /* Ensure we can see the FORCE_COW bit */
648 486197484 : smp_mb__before_atomic();
649 :
650 : /*
651 : * We do not need to cow a block if
652 : * 1) this block is not created or changed in this transaction;
653 : * 2) this block does not belong to TREE_RELOC tree;
654 : * 3) the root is not forced COW.
655 : *
656 : * What is forced COW:
657 : * when we create snapshot during committing the transaction,
658 : * after we've finished copying src root, we must COW the shared
659 : * block to ensure the metadata consistency.
660 : */
661 486197484 : if (btrfs_header_generation(buf) == trans->transid &&
662 470098486 : !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
663 470098486 : !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
664 470096613 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
665 470096613 : !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
666 470088123 : return 0;
667 : return 1;
668 : }
669 :
670 : /*
671 : * cows a single block, see __btrfs_cow_block for the real work.
672 : * This version of it has extra checks so that a block isn't COWed more than
673 : * once per transaction, as long as it hasn't been written yet
674 : */
675 12604001 : noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
676 : struct btrfs_root *root, struct extent_buffer *buf,
677 : struct extent_buffer *parent, int parent_slot,
678 : struct extent_buffer **cow_ret,
679 : enum btrfs_lock_nesting nest)
680 : {
681 12604001 : struct btrfs_fs_info *fs_info = root->fs_info;
682 12604001 : u64 search_start;
683 12604001 : int ret;
684 :
685 25208002 : if (test_bit(BTRFS_ROOT_DELETING, &root->state))
686 0 : btrfs_err(fs_info,
687 : "COW'ing blocks on a fs root that's being dropped");
688 :
689 12604001 : if (trans->transaction != fs_info->running_transaction)
690 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
691 : trans->transid,
692 : fs_info->running_transaction->transid);
693 :
694 12604017 : if (trans->transid != fs_info->generation)
695 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
696 : trans->transid, fs_info->generation);
697 :
698 12604017 : if (!should_cow_block(trans, root, buf)) {
699 3973058 : *cow_ret = buf;
700 3973058 : return 0;
701 : }
702 :
703 8630899 : search_start = buf->start & ~((u64)SZ_1G - 1);
704 :
705 : /*
706 : * Before CoWing this block for later modification, check if it's
707 : * the subtree root and do the delayed subtree trace if needed.
708 : *
709 : * Also We don't care about the error, as it's handled internally.
710 : */
711 8630899 : btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
712 8630902 : ret = __btrfs_cow_block(trans, root, buf, parent,
713 : parent_slot, cow_ret, search_start, 0, nest);
714 :
715 8631192 : trace_btrfs_cow_block(root, buf, *cow_ret);
716 :
717 8631192 : return ret;
718 : }
719 : ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
720 :
721 : /*
722 : * helper function for defrag to decide if two blocks pointed to by a
723 : * node are actually close by
724 : */
725 0 : static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
726 : {
727 0 : if (blocknr < other && other - (blocknr + blocksize) < 32768)
728 : return 1;
729 0 : if (blocknr > other && blocknr - (other + blocksize) < 32768)
730 0 : return 1;
731 : return 0;
732 : }
733 :
734 : #ifdef __LITTLE_ENDIAN
735 :
736 : /*
737 : * Compare two keys, on little-endian the disk order is same as CPU order and
738 : * we can avoid the conversion.
739 : */
740 : static int comp_keys(const struct btrfs_disk_key *disk_key,
741 : const struct btrfs_key *k2)
742 : {
743 5242626199 : const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
744 :
745 5242626199 : return btrfs_comp_cpu_keys(k1, k2);
746 : }
747 :
748 : #else
749 :
750 : /*
751 : * compare two keys in a memcmp fashion
752 : */
753 : static int comp_keys(const struct btrfs_disk_key *disk,
754 : const struct btrfs_key *k2)
755 : {
756 : struct btrfs_key k1;
757 :
758 : btrfs_disk_key_to_cpu(&k1, disk);
759 :
760 : return btrfs_comp_cpu_keys(&k1, k2);
761 : }
762 : #endif
763 :
764 : /*
765 : * same as comp_keys only with two btrfs_key's
766 : */
767 6769975715 : int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
768 : {
769 6769975715 : if (k1->objectid > k2->objectid)
770 : return 1;
771 5504038428 : if (k1->objectid < k2->objectid)
772 : return -1;
773 3448893878 : if (k1->type > k2->type)
774 : return 1;
775 3346986289 : if (k1->type < k2->type)
776 : return -1;
777 3171697303 : if (k1->offset > k2->offset)
778 : return 1;
779 2530129606 : if (k1->offset < k2->offset)
780 1986334256 : return -1;
781 : return 0;
782 : }
783 :
784 : /*
785 : * this is used by the defrag code to go through all the
786 : * leaves pointed to by a node and reallocate them so that
787 : * disk order is close to key order
788 : */
789 0 : int btrfs_realloc_node(struct btrfs_trans_handle *trans,
790 : struct btrfs_root *root, struct extent_buffer *parent,
791 : int start_slot, u64 *last_ret,
792 : struct btrfs_key *progress)
793 : {
794 0 : struct btrfs_fs_info *fs_info = root->fs_info;
795 0 : struct extent_buffer *cur;
796 0 : u64 blocknr;
797 0 : u64 search_start = *last_ret;
798 0 : u64 last_block = 0;
799 0 : u64 other;
800 0 : u32 parent_nritems;
801 0 : int end_slot;
802 0 : int i;
803 0 : int err = 0;
804 0 : u32 blocksize;
805 0 : int progress_passed = 0;
806 0 : struct btrfs_disk_key disk_key;
807 :
808 0 : WARN_ON(trans->transaction != fs_info->running_transaction);
809 0 : WARN_ON(trans->transid != fs_info->generation);
810 :
811 0 : parent_nritems = btrfs_header_nritems(parent);
812 0 : blocksize = fs_info->nodesize;
813 0 : end_slot = parent_nritems - 1;
814 :
815 0 : if (parent_nritems <= 1)
816 : return 0;
817 :
818 0 : for (i = start_slot; i <= end_slot; i++) {
819 0 : int close = 1;
820 :
821 0 : btrfs_node_key(parent, &disk_key, i);
822 0 : if (!progress_passed && comp_keys(&disk_key, progress) < 0)
823 0 : continue;
824 :
825 0 : progress_passed = 1;
826 0 : blocknr = btrfs_node_blockptr(parent, i);
827 0 : if (last_block == 0)
828 0 : last_block = blocknr;
829 :
830 0 : if (i > 0) {
831 0 : other = btrfs_node_blockptr(parent, i - 1);
832 0 : close = close_blocks(blocknr, other, blocksize);
833 : }
834 0 : if (!close && i < end_slot) {
835 0 : other = btrfs_node_blockptr(parent, i + 1);
836 0 : close = close_blocks(blocknr, other, blocksize);
837 : }
838 0 : if (close) {
839 0 : last_block = blocknr;
840 0 : continue;
841 : }
842 :
843 0 : cur = btrfs_read_node_slot(parent, i);
844 0 : if (IS_ERR(cur))
845 0 : return PTR_ERR(cur);
846 0 : if (search_start == 0)
847 0 : search_start = last_block;
848 :
849 0 : btrfs_tree_lock(cur);
850 0 : err = __btrfs_cow_block(trans, root, cur, parent, i,
851 : &cur, search_start,
852 0 : min(16 * blocksize,
853 : (end_slot - i) * blocksize),
854 : BTRFS_NESTING_COW);
855 0 : if (err) {
856 0 : btrfs_tree_unlock(cur);
857 0 : free_extent_buffer(cur);
858 0 : break;
859 : }
860 0 : search_start = cur->start;
861 0 : last_block = cur->start;
862 0 : *last_ret = search_start;
863 0 : btrfs_tree_unlock(cur);
864 0 : free_extent_buffer(cur);
865 : }
866 : return err;
867 : }
868 :
869 : /*
870 : * Search for a key in the given extent_buffer.
871 : *
872 : * The lower boundary for the search is specified by the slot number @first_slot.
873 : * Use a value of 0 to search over the whole extent buffer. Works for both
874 : * leaves and nodes.
875 : *
876 : * The slot in the extent buffer is returned via @slot. If the key exists in the
877 : * extent buffer, then @slot will point to the slot where the key is, otherwise
878 : * it points to the slot where you would insert the key.
879 : *
880 : * Slot may point to the total number of items (i.e. one position beyond the last
881 : * key) if the key is bigger than the last key in the extent buffer.
882 : */
883 884289995 : int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
884 : const struct btrfs_key *key, int *slot)
885 : {
886 884289995 : unsigned long p;
887 884289995 : int item_size;
888 : /*
889 : * Use unsigned types for the low and high slots, so that we get a more
890 : * efficient division in the search loop below.
891 : */
892 884289995 : u32 low = first_slot;
893 884289995 : u32 high = btrfs_header_nritems(eb);
894 884289995 : int ret;
895 884289995 : const int key_size = sizeof(struct btrfs_disk_key);
896 :
897 884289995 : if (unlikely(low > high)) {
898 0 : btrfs_err(eb->fs_info,
899 : "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
900 : __func__, low, high, eb->start,
901 : btrfs_header_owner(eb), btrfs_header_level(eb));
902 0 : return -EINVAL;
903 : }
904 :
905 884289995 : if (btrfs_header_level(eb) == 0) {
906 : p = offsetof(struct btrfs_leaf, items);
907 : item_size = sizeof(struct btrfs_item);
908 : } else {
909 552236854 : p = offsetof(struct btrfs_node, ptrs);
910 552236854 : item_size = sizeof(struct btrfs_key_ptr);
911 : }
912 :
913 5844972084 : while (low < high) {
914 5155758696 : unsigned long oip;
915 5155758696 : unsigned long offset;
916 5155758696 : struct btrfs_disk_key *tmp;
917 5155758696 : struct btrfs_disk_key unaligned;
918 5155758696 : int mid;
919 :
920 5155758696 : mid = (low + high) / 2;
921 5155758696 : offset = p + mid * item_size;
922 5155758696 : oip = offset_in_page(offset);
923 :
924 5155758696 : if (oip + key_size <= PAGE_SIZE) {
925 5145705609 : const unsigned long idx = get_eb_page_index(offset);
926 5145705609 : char *kaddr = page_address(eb->pages[idx]);
927 :
928 5145705609 : oip = get_eb_offset_in_page(eb, offset);
929 5145705609 : tmp = (struct btrfs_disk_key *)(kaddr + oip);
930 : } else {
931 10053087 : read_extent_buffer(eb, &unaligned, offset, key_size);
932 10053087 : tmp = &unaligned;
933 : }
934 :
935 5156049692 : ret = comp_keys(tmp, key);
936 :
937 5156049692 : if (ret < 0)
938 3001724268 : low = mid + 1;
939 2154325424 : else if (ret > 0)
940 : high = mid;
941 : else {
942 195367603 : *slot = mid;
943 195367603 : return 0;
944 : }
945 : }
946 689213388 : *slot = low;
947 689213388 : return 1;
948 : }
949 :
950 365277 : static void root_add_used(struct btrfs_root *root, u32 size)
951 : {
952 365277 : spin_lock(&root->accounting_lock);
953 365277 : btrfs_set_root_used(&root->root_item,
954 : btrfs_root_used(&root->root_item) + size);
955 365277 : spin_unlock(&root->accounting_lock);
956 365277 : }
957 :
958 120172 : static void root_sub_used(struct btrfs_root *root, u32 size)
959 : {
960 120172 : spin_lock(&root->accounting_lock);
961 120172 : btrfs_set_root_used(&root->root_item,
962 : btrfs_root_used(&root->root_item) - size);
963 120172 : spin_unlock(&root->accounting_lock);
964 120172 : }
965 :
966 : /* given a node and slot number, this reads the blocks it points to. The
967 : * extent buffer is returned with a reference taken (but unlocked).
968 : */
969 8341621 : struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
970 : int slot)
971 : {
972 8341621 : int level = btrfs_header_level(parent);
973 8341621 : struct btrfs_tree_parent_check check = { 0 };
974 8341621 : struct extent_buffer *eb;
975 :
976 8341621 : if (slot < 0 || slot >= btrfs_header_nritems(parent))
977 : return ERR_PTR(-ENOENT);
978 :
979 8341622 : ASSERT(level);
980 :
981 8341622 : check.level = level - 1;
982 8341622 : check.transid = btrfs_node_ptr_generation(parent, slot);
983 8341691 : check.owner_root = btrfs_header_owner(parent);
984 8341691 : check.has_first_key = true;
985 8341691 : btrfs_node_key_to_cpu(parent, &check.first_key, slot);
986 :
987 8341713 : eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
988 : &check);
989 8341806 : if (IS_ERR(eb))
990 : return eb;
991 16683612 : if (!extent_buffer_uptodate(eb)) {
992 0 : free_extent_buffer(eb);
993 0 : return ERR_PTR(-EIO);
994 : }
995 :
996 : return eb;
997 : }
998 :
999 : /*
1000 : * node level balancing, used to make sure nodes are in proper order for
1001 : * item deletion. We balance from the top down, so we have to make sure
1002 : * that a deletion won't leave an node completely empty later on.
1003 : */
1004 53853339 : static noinline int balance_level(struct btrfs_trans_handle *trans,
1005 : struct btrfs_root *root,
1006 : struct btrfs_path *path, int level)
1007 : {
1008 53853339 : struct btrfs_fs_info *fs_info = root->fs_info;
1009 53853339 : struct extent_buffer *right = NULL;
1010 53853339 : struct extent_buffer *mid;
1011 53853339 : struct extent_buffer *left = NULL;
1012 53853339 : struct extent_buffer *parent = NULL;
1013 53853339 : int ret = 0;
1014 53853339 : int wret;
1015 53853339 : int pslot;
1016 53853339 : int orig_slot = path->slots[level];
1017 53853339 : u64 orig_ptr;
1018 :
1019 53853339 : ASSERT(level > 0);
1020 :
1021 53853339 : mid = path->nodes[level];
1022 :
1023 53853339 : WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
1024 53853339 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1025 :
1026 53853339 : orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1027 :
1028 53854027 : if (level < BTRFS_MAX_LEVEL - 1) {
1029 53854047 : parent = path->nodes[level + 1];
1030 53854047 : pslot = path->slots[level + 1];
1031 : }
1032 :
1033 : /*
1034 : * deal with the case where there is only one pointer in the root
1035 : * by promoting the node below to a root
1036 : */
1037 53854047 : if (!parent) {
1038 47202863 : struct extent_buffer *child;
1039 :
1040 47202863 : if (btrfs_header_nritems(mid) != 1)
1041 47202863 : return 0;
1042 :
1043 : /* promote the child to a root */
1044 1158 : child = btrfs_read_node_slot(mid, 0);
1045 1158 : if (IS_ERR(child)) {
1046 0 : ret = PTR_ERR(child);
1047 0 : goto out;
1048 : }
1049 :
1050 1158 : btrfs_tree_lock(child);
1051 1158 : ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1052 : BTRFS_NESTING_COW);
1053 1158 : if (ret) {
1054 0 : btrfs_tree_unlock(child);
1055 0 : free_extent_buffer(child);
1056 0 : goto out;
1057 : }
1058 :
1059 1158 : ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
1060 1158 : if (ret < 0) {
1061 0 : btrfs_tree_unlock(child);
1062 0 : free_extent_buffer(child);
1063 0 : btrfs_abort_transaction(trans, ret);
1064 0 : goto out;
1065 : }
1066 1158 : rcu_assign_pointer(root->node, child);
1067 :
1068 1158 : add_root_to_dirty_list(root);
1069 1158 : btrfs_tree_unlock(child);
1070 :
1071 1158 : path->locks[level] = 0;
1072 1158 : path->nodes[level] = NULL;
1073 1158 : btrfs_clear_buffer_dirty(trans, mid);
1074 1158 : btrfs_tree_unlock(mid);
1075 : /* once for the path */
1076 1158 : free_extent_buffer(mid);
1077 :
1078 1158 : root_sub_used(root, mid->len);
1079 1158 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1080 : /* once for the root ptr */
1081 1158 : free_extent_buffer_stale(mid);
1082 1158 : return 0;
1083 : }
1084 6651164 : if (btrfs_header_nritems(mid) >
1085 6651164 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1086 : return 0;
1087 :
1088 15768 : if (pslot) {
1089 15741 : left = btrfs_read_node_slot(parent, pslot - 1);
1090 15741 : if (IS_ERR(left)) {
1091 0 : ret = PTR_ERR(left);
1092 0 : left = NULL;
1093 0 : goto out;
1094 : }
1095 :
1096 15741 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1097 15741 : wret = btrfs_cow_block(trans, root, left,
1098 : parent, pslot - 1, &left,
1099 : BTRFS_NESTING_LEFT_COW);
1100 15741 : if (wret) {
1101 0 : ret = wret;
1102 0 : goto out;
1103 : }
1104 : }
1105 :
1106 15768 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1107 117 : right = btrfs_read_node_slot(parent, pslot + 1);
1108 117 : if (IS_ERR(right)) {
1109 0 : ret = PTR_ERR(right);
1110 0 : right = NULL;
1111 0 : goto out;
1112 : }
1113 :
1114 117 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1115 117 : wret = btrfs_cow_block(trans, root, right,
1116 : parent, pslot + 1, &right,
1117 : BTRFS_NESTING_RIGHT_COW);
1118 117 : if (wret) {
1119 0 : ret = wret;
1120 0 : goto out;
1121 : }
1122 : }
1123 :
1124 : /* first, try to make some room in the middle buffer */
1125 15768 : if (left) {
1126 15741 : orig_slot += btrfs_header_nritems(left);
1127 15741 : wret = push_node_left(trans, left, mid, 1);
1128 15741 : if (wret < 0)
1129 : ret = wret;
1130 : }
1131 :
1132 : /*
1133 : * then try to empty the right most buffer into the middle
1134 : */
1135 15768 : if (right) {
1136 117 : wret = push_node_left(trans, mid, right, 1);
1137 117 : if (wret < 0 && wret != -ENOSPC)
1138 : ret = wret;
1139 117 : if (btrfs_header_nritems(right) == 0) {
1140 96 : btrfs_clear_buffer_dirty(trans, right);
1141 96 : btrfs_tree_unlock(right);
1142 96 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
1143 96 : if (ret < 0) {
1144 0 : free_extent_buffer_stale(right);
1145 0 : right = NULL;
1146 0 : goto out;
1147 : }
1148 96 : root_sub_used(root, right->len);
1149 96 : btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1150 : 0, 1);
1151 96 : free_extent_buffer_stale(right);
1152 96 : right = NULL;
1153 : } else {
1154 21 : struct btrfs_disk_key right_key;
1155 21 : btrfs_node_key(right, &right_key, 0);
1156 21 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1157 : BTRFS_MOD_LOG_KEY_REPLACE);
1158 21 : if (ret < 0) {
1159 0 : btrfs_abort_transaction(trans, ret);
1160 0 : goto out;
1161 : }
1162 21 : btrfs_set_node_key(parent, &right_key, pslot + 1);
1163 21 : btrfs_mark_buffer_dirty(parent);
1164 : }
1165 : }
1166 15768 : if (btrfs_header_nritems(mid) == 1) {
1167 : /*
1168 : * we're not allowed to leave a node with one item in the
1169 : * tree during a delete. A deletion from lower in the tree
1170 : * could try to delete the only pointer in this node.
1171 : * So, pull some keys from the left.
1172 : * There has to be a left pointer at this point because
1173 : * otherwise we would have pulled some pointers from the
1174 : * right
1175 : */
1176 0 : if (unlikely(!left)) {
1177 0 : btrfs_crit(fs_info,
1178 : "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
1179 : parent->start, btrfs_header_level(parent),
1180 : mid->start, btrfs_root_id(root));
1181 0 : ret = -EUCLEAN;
1182 0 : btrfs_abort_transaction(trans, ret);
1183 0 : goto out;
1184 : }
1185 0 : wret = balance_node_right(trans, mid, left);
1186 0 : if (wret < 0) {
1187 0 : ret = wret;
1188 0 : goto out;
1189 : }
1190 0 : if (wret == 1) {
1191 0 : wret = push_node_left(trans, left, mid, 1);
1192 0 : if (wret < 0)
1193 : ret = wret;
1194 : }
1195 0 : BUG_ON(wret == 1);
1196 : }
1197 15768 : if (btrfs_header_nritems(mid) == 0) {
1198 18 : btrfs_clear_buffer_dirty(trans, mid);
1199 18 : btrfs_tree_unlock(mid);
1200 18 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
1201 18 : if (ret < 0) {
1202 0 : free_extent_buffer_stale(mid);
1203 0 : mid = NULL;
1204 0 : goto out;
1205 : }
1206 18 : root_sub_used(root, mid->len);
1207 18 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1208 18 : free_extent_buffer_stale(mid);
1209 18 : mid = NULL;
1210 : } else {
1211 : /* update the parent key to reflect our changes */
1212 15750 : struct btrfs_disk_key mid_key;
1213 15750 : btrfs_node_key(mid, &mid_key, 0);
1214 15750 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1215 : BTRFS_MOD_LOG_KEY_REPLACE);
1216 15750 : if (ret < 0) {
1217 0 : btrfs_abort_transaction(trans, ret);
1218 0 : goto out;
1219 : }
1220 15750 : btrfs_set_node_key(parent, &mid_key, pslot);
1221 15750 : btrfs_mark_buffer_dirty(parent);
1222 : }
1223 :
1224 : /* update the path */
1225 15768 : if (left) {
1226 15741 : if (btrfs_header_nritems(left) > orig_slot) {
1227 97 : atomic_inc(&left->refs);
1228 : /* left was locked after cow */
1229 97 : path->nodes[level] = left;
1230 97 : path->slots[level + 1] -= 1;
1231 97 : path->slots[level] = orig_slot;
1232 97 : if (mid) {
1233 79 : btrfs_tree_unlock(mid);
1234 79 : free_extent_buffer(mid);
1235 : }
1236 : } else {
1237 15644 : orig_slot -= btrfs_header_nritems(left);
1238 15644 : path->slots[level] = orig_slot;
1239 : }
1240 : }
1241 : /* double check we haven't messed things up */
1242 15768 : if (orig_ptr !=
1243 15768 : btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1244 0 : BUG();
1245 15768 : out:
1246 15768 : if (right) {
1247 21 : btrfs_tree_unlock(right);
1248 21 : free_extent_buffer(right);
1249 : }
1250 15768 : if (left) {
1251 15741 : if (path->nodes[level] != left)
1252 15644 : btrfs_tree_unlock(left);
1253 15741 : free_extent_buffer(left);
1254 : }
1255 : return ret;
1256 : }
1257 :
1258 : /* Node balancing for insertion. Here we only split or push nodes around
1259 : * when they are completely full. This is also done top down, so we
1260 : * have to be pessimistic.
1261 : */
1262 114554 : static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1263 : struct btrfs_root *root,
1264 : struct btrfs_path *path, int level)
1265 : {
1266 114554 : struct btrfs_fs_info *fs_info = root->fs_info;
1267 114554 : struct extent_buffer *right = NULL;
1268 114554 : struct extent_buffer *mid;
1269 114554 : struct extent_buffer *left = NULL;
1270 114554 : struct extent_buffer *parent = NULL;
1271 114554 : int ret = 0;
1272 114554 : int wret;
1273 114554 : int pslot;
1274 114554 : int orig_slot = path->slots[level];
1275 :
1276 114554 : if (level == 0)
1277 : return 1;
1278 :
1279 114554 : mid = path->nodes[level];
1280 114554 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1281 :
1282 114554 : if (level < BTRFS_MAX_LEVEL - 1) {
1283 114554 : parent = path->nodes[level + 1];
1284 114554 : pslot = path->slots[level + 1];
1285 : }
1286 :
1287 114554 : if (!parent)
1288 : return 1;
1289 :
1290 : /* first, try to make some room in the middle buffer */
1291 114554 : if (pslot) {
1292 68486 : u32 left_nr;
1293 :
1294 68486 : left = btrfs_read_node_slot(parent, pslot - 1);
1295 68486 : if (IS_ERR(left))
1296 0 : return PTR_ERR(left);
1297 :
1298 68486 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1299 :
1300 68486 : left_nr = btrfs_header_nritems(left);
1301 68486 : if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1302 : wret = 1;
1303 : } else {
1304 57662 : ret = btrfs_cow_block(trans, root, left, parent,
1305 : pslot - 1, &left,
1306 : BTRFS_NESTING_LEFT_COW);
1307 57662 : if (ret)
1308 : wret = 1;
1309 : else {
1310 57662 : wret = push_node_left(trans, left, mid, 0);
1311 : }
1312 : }
1313 57662 : if (wret < 0)
1314 : ret = wret;
1315 57662 : if (wret == 0) {
1316 57662 : struct btrfs_disk_key disk_key;
1317 57662 : orig_slot += left_nr;
1318 57662 : btrfs_node_key(mid, &disk_key, 0);
1319 57662 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1320 : BTRFS_MOD_LOG_KEY_REPLACE);
1321 57662 : if (ret < 0) {
1322 0 : btrfs_tree_unlock(left);
1323 0 : free_extent_buffer(left);
1324 0 : btrfs_abort_transaction(trans, ret);
1325 0 : return ret;
1326 : }
1327 57662 : btrfs_set_node_key(parent, &disk_key, pslot);
1328 57662 : btrfs_mark_buffer_dirty(parent);
1329 57662 : if (btrfs_header_nritems(left) > orig_slot) {
1330 277 : path->nodes[level] = left;
1331 277 : path->slots[level + 1] -= 1;
1332 277 : path->slots[level] = orig_slot;
1333 277 : btrfs_tree_unlock(mid);
1334 277 : free_extent_buffer(mid);
1335 : } else {
1336 57385 : orig_slot -=
1337 : btrfs_header_nritems(left);
1338 57385 : path->slots[level] = orig_slot;
1339 57385 : btrfs_tree_unlock(left);
1340 57385 : free_extent_buffer(left);
1341 : }
1342 57662 : return 0;
1343 : }
1344 10824 : btrfs_tree_unlock(left);
1345 10824 : free_extent_buffer(left);
1346 : }
1347 :
1348 : /*
1349 : * then try to empty the right most buffer into the middle
1350 : */
1351 56892 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1352 56615 : u32 right_nr;
1353 :
1354 56615 : right = btrfs_read_node_slot(parent, pslot + 1);
1355 56615 : if (IS_ERR(right))
1356 0 : return PTR_ERR(right);
1357 :
1358 56615 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1359 :
1360 56615 : right_nr = btrfs_header_nritems(right);
1361 56615 : if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1362 : wret = 1;
1363 : } else {
1364 56570 : ret = btrfs_cow_block(trans, root, right,
1365 : parent, pslot + 1,
1366 : &right, BTRFS_NESTING_RIGHT_COW);
1367 56570 : if (ret)
1368 : wret = 1;
1369 : else {
1370 56570 : wret = balance_node_right(trans, right, mid);
1371 : }
1372 : }
1373 56570 : if (wret < 0)
1374 : ret = wret;
1375 56570 : if (wret == 0) {
1376 56570 : struct btrfs_disk_key disk_key;
1377 :
1378 56570 : btrfs_node_key(right, &disk_key, 0);
1379 56570 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1380 : BTRFS_MOD_LOG_KEY_REPLACE);
1381 56570 : if (ret < 0) {
1382 0 : btrfs_tree_unlock(right);
1383 0 : free_extent_buffer(right);
1384 0 : btrfs_abort_transaction(trans, ret);
1385 0 : return ret;
1386 : }
1387 56570 : btrfs_set_node_key(parent, &disk_key, pslot + 1);
1388 56570 : btrfs_mark_buffer_dirty(parent);
1389 :
1390 56570 : if (btrfs_header_nritems(mid) <= orig_slot) {
1391 87 : path->nodes[level] = right;
1392 87 : path->slots[level + 1] += 1;
1393 87 : path->slots[level] = orig_slot -
1394 : btrfs_header_nritems(mid);
1395 87 : btrfs_tree_unlock(mid);
1396 87 : free_extent_buffer(mid);
1397 : } else {
1398 56483 : btrfs_tree_unlock(right);
1399 56483 : free_extent_buffer(right);
1400 : }
1401 56570 : return 0;
1402 : }
1403 45 : btrfs_tree_unlock(right);
1404 45 : free_extent_buffer(right);
1405 : }
1406 : return 1;
1407 : }
1408 :
1409 : /*
1410 : * readahead one full node of leaves, finding things that are close
1411 : * to the block in 'slot', and triggering ra on them.
1412 : */
1413 36414 : static void reada_for_search(struct btrfs_fs_info *fs_info,
1414 : struct btrfs_path *path,
1415 : int level, int slot, u64 objectid)
1416 : {
1417 36414 : struct extent_buffer *node;
1418 36414 : struct btrfs_disk_key disk_key;
1419 36414 : u32 nritems;
1420 36414 : u64 search;
1421 36414 : u64 target;
1422 36414 : u64 nread = 0;
1423 36414 : u64 nread_max;
1424 36414 : u32 nr;
1425 36414 : u32 blocksize;
1426 36414 : u32 nscan = 0;
1427 :
1428 36414 : if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1429 552 : return;
1430 :
1431 35862 : if (!path->nodes[level])
1432 : return;
1433 :
1434 35862 : node = path->nodes[level];
1435 :
1436 : /*
1437 : * Since the time between visiting leaves is much shorter than the time
1438 : * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1439 : * much IO at once (possibly random).
1440 : */
1441 35862 : if (path->reada == READA_FORWARD_ALWAYS) {
1442 19923 : if (level > 1)
1443 6694 : nread_max = node->fs_info->nodesize;
1444 : else
1445 : nread_max = SZ_128K;
1446 : } else {
1447 : nread_max = SZ_64K;
1448 : }
1449 :
1450 35862 : search = btrfs_node_blockptr(node, slot);
1451 35858 : blocksize = fs_info->nodesize;
1452 35858 : if (path->reada != READA_FORWARD_ALWAYS) {
1453 15935 : struct extent_buffer *eb;
1454 :
1455 15935 : eb = find_extent_buffer(fs_info, search);
1456 15939 : if (eb) {
1457 0 : free_extent_buffer(eb);
1458 0 : return;
1459 : }
1460 : }
1461 :
1462 35862 : target = search;
1463 :
1464 35862 : nritems = btrfs_header_nritems(node);
1465 35862 : nr = slot;
1466 :
1467 605160 : while (1) {
1468 605160 : if (path->reada == READA_BACK) {
1469 544 : if (nr == 0)
1470 : break;
1471 544 : nr--;
1472 604616 : } else if (path->reada == READA_FORWARD ||
1473 : path->reada == READA_FORWARD_ALWAYS) {
1474 604578 : nr++;
1475 604578 : if (nr >= nritems)
1476 : break;
1477 : }
1478 598895 : if (path->reada == READA_BACK && objectid) {
1479 544 : btrfs_node_key(node, &disk_key, nr);
1480 544 : if (btrfs_disk_key_objectid(&disk_key) != objectid)
1481 : break;
1482 : }
1483 598644 : search = btrfs_node_blockptr(node, nr);
1484 598647 : if (path->reada == READA_FORWARD_ALWAYS ||
1485 474768 : (search <= target && target - search <= 65536) ||
1486 271099 : (search > target && search - target <= 65536)) {
1487 134942 : btrfs_readahead_node_child(node, nr);
1488 134940 : nread += blocksize;
1489 : }
1490 598645 : nscan++;
1491 598645 : if (nread > nread_max || nscan > 32)
1492 : break;
1493 : }
1494 : }
1495 :
1496 53967976 : static noinline void reada_for_balance(struct btrfs_path *path, int level)
1497 : {
1498 53967976 : struct extent_buffer *parent;
1499 53967976 : int slot;
1500 53967976 : int nritems;
1501 :
1502 53967976 : parent = path->nodes[level + 1];
1503 53967976 : if (!parent)
1504 : return;
1505 :
1506 6765718 : nritems = btrfs_header_nritems(parent);
1507 6765718 : slot = path->slots[level + 1];
1508 :
1509 6765718 : if (slot > 0)
1510 6377432 : btrfs_readahead_node_child(parent, slot - 1);
1511 6765718 : if (slot + 1 < nritems)
1512 1150351 : btrfs_readahead_node_child(parent, slot + 1);
1513 : }
1514 :
1515 :
1516 : /*
1517 : * when we walk down the tree, it is usually safe to unlock the higher layers
1518 : * in the tree. The exceptions are when our path goes through slot 0, because
1519 : * operations on the tree might require changing key pointers higher up in the
1520 : * tree.
1521 : *
1522 : * callers might also have set path->keep_locks, which tells this code to keep
1523 : * the lock if the path points to the last slot in the block. This is part of
1524 : * walking through the tree, and selecting the next slot in the higher block.
1525 : *
1526 : * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1527 : * if lowest_unlock is 1, level 0 won't be unlocked
1528 : */
1529 905098459 : static noinline void unlock_up(struct btrfs_path *path, int level,
1530 : int lowest_unlock, int min_write_lock_level,
1531 : int *write_lock_level)
1532 : {
1533 905098459 : int i;
1534 905098459 : int skip_level = level;
1535 905098459 : bool check_skip = true;
1536 :
1537 2026313637 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1538 2026313637 : if (!path->nodes[i])
1539 : break;
1540 1497897874 : if (!path->locks[i])
1541 : break;
1542 :
1543 1119706137 : if (check_skip) {
1544 1092208079 : if (path->slots[i] == 0) {
1545 146609967 : skip_level = i + 1;
1546 146609967 : continue;
1547 : }
1548 :
1549 945598112 : if (path->keep_locks) {
1550 228499325 : u32 nritems;
1551 :
1552 228499325 : nritems = btrfs_header_nritems(path->nodes[i]);
1553 228499325 : if (nritems < 1 || path->slots[i] >= nritems - 1) {
1554 122686264 : skip_level = i + 1;
1555 122686264 : continue;
1556 : }
1557 : }
1558 : }
1559 :
1560 850409906 : if (i >= lowest_unlock && i > skip_level) {
1561 132993919 : check_skip = false;
1562 132993919 : btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1563 134502960 : path->locks[i] = 0;
1564 134502960 : if (write_lock_level &&
1565 134502960 : i > min_write_lock_level &&
1566 40443805 : i <= *write_lock_level) {
1567 152502 : *write_lock_level = i - 1;
1568 : }
1569 : }
1570 : }
1571 906607500 : }
1572 :
1573 : /*
1574 : * Helper function for btrfs_search_slot() and other functions that do a search
1575 : * on a btree. The goal is to find a tree block in the cache (the radix tree at
1576 : * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1577 : * its pages from disk.
1578 : *
1579 : * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1580 : * whole btree search, starting again from the current root node.
1581 : */
1582 : static int
1583 532766581 : read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1584 : struct extent_buffer **eb_ret, int level, int slot,
1585 : const struct btrfs_key *key)
1586 : {
1587 532766581 : struct btrfs_fs_info *fs_info = root->fs_info;
1588 532766581 : struct btrfs_tree_parent_check check = { 0 };
1589 532766581 : u64 blocknr;
1590 532766581 : u64 gen;
1591 532766581 : struct extent_buffer *tmp;
1592 532766581 : int ret;
1593 532766581 : int parent_level;
1594 532766581 : bool unlock_up;
1595 :
1596 532766581 : unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1597 532766581 : blocknr = btrfs_node_blockptr(*eb_ret, slot);
1598 532537435 : gen = btrfs_node_ptr_generation(*eb_ret, slot);
1599 532565891 : parent_level = btrfs_header_level(*eb_ret);
1600 532565891 : btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1601 532559682 : check.has_first_key = true;
1602 532559682 : check.level = parent_level - 1;
1603 532559682 : check.transid = gen;
1604 532559682 : check.owner_root = root->root_key.objectid;
1605 :
1606 : /*
1607 : * If we need to read an extent buffer from disk and we are holding locks
1608 : * on upper level nodes, we unlock all the upper nodes before reading the
1609 : * extent buffer, and then return -EAGAIN to the caller as it needs to
1610 : * restart the search. We don't release the lock on the current level
1611 : * because we need to walk this node to figure out which blocks to read.
1612 : */
1613 532559682 : tmp = find_extent_buffer(fs_info, blocknr);
1614 533082653 : if (tmp) {
1615 532925261 : if (p->reada == READA_FORWARD_ALWAYS)
1616 19910 : reada_for_search(fs_info, p, level, slot, key->objectid);
1617 :
1618 : /* first we do an atomic uptodate check */
1619 532925261 : if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1620 : /*
1621 : * Do extra check for first_key, eb can be stale due to
1622 : * being cached, read from scrub, or have multiple
1623 : * parents (shared tree blocks).
1624 : */
1625 532815628 : if (btrfs_verify_level_key(tmp,
1626 : parent_level - 1, &check.first_key, gen)) {
1627 0 : free_extent_buffer(tmp);
1628 0 : return -EUCLEAN;
1629 : }
1630 532772897 : *eb_ret = tmp;
1631 532772897 : return 0;
1632 : }
1633 :
1634 261 : if (p->nowait) {
1635 0 : free_extent_buffer(tmp);
1636 0 : return -EAGAIN;
1637 : }
1638 :
1639 261 : if (unlock_up)
1640 2 : btrfs_unlock_up_safe(p, level + 1);
1641 :
1642 : /* now we're allowed to do a blocking uptodate check */
1643 261 : ret = btrfs_read_extent_buffer(tmp, &check);
1644 261 : if (ret) {
1645 0 : free_extent_buffer(tmp);
1646 0 : btrfs_release_path(p);
1647 0 : return -EIO;
1648 : }
1649 261 : if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1650 0 : free_extent_buffer(tmp);
1651 0 : btrfs_release_path(p);
1652 0 : return -EUCLEAN;
1653 : }
1654 :
1655 261 : if (unlock_up)
1656 2 : ret = -EAGAIN;
1657 :
1658 259 : goto out;
1659 157392 : } else if (p->nowait) {
1660 : return -EAGAIN;
1661 : }
1662 :
1663 157392 : if (unlock_up) {
1664 19152 : btrfs_unlock_up_safe(p, level + 1);
1665 19152 : ret = -EAGAIN;
1666 : } else {
1667 : ret = 0;
1668 : }
1669 :
1670 157394 : if (p->reada != READA_NONE)
1671 16503 : reada_for_search(fs_info, p, level, slot, key->objectid);
1672 :
1673 157396 : tmp = read_tree_block(fs_info, blocknr, &check);
1674 157393 : if (IS_ERR(tmp)) {
1675 0 : btrfs_release_path(p);
1676 0 : return PTR_ERR(tmp);
1677 : }
1678 : /*
1679 : * If the read above didn't mark this buffer up to date,
1680 : * it will never end up being up to date. Set ret to EIO now
1681 : * and give up so that our caller doesn't loop forever
1682 : * on our EAGAINs.
1683 : */
1684 314786 : if (!extent_buffer_uptodate(tmp))
1685 : ret = -EIO;
1686 :
1687 157392 : out:
1688 157653 : if (ret == 0) {
1689 138496 : *eb_ret = tmp;
1690 : } else {
1691 19158 : free_extent_buffer(tmp);
1692 19158 : btrfs_release_path(p);
1693 : }
1694 :
1695 : return ret;
1696 : }
1697 :
1698 : /*
1699 : * helper function for btrfs_search_slot. This does all of the checks
1700 : * for node-level blocks and does any balancing required based on
1701 : * the ins_len.
1702 : *
1703 : * If no extra work was required, zero is returned. If we had to
1704 : * drop the path, -EAGAIN is returned and btrfs_search_slot must
1705 : * start over
1706 : */
1707 : static int
1708 563274850 : setup_nodes_for_search(struct btrfs_trans_handle *trans,
1709 : struct btrfs_root *root, struct btrfs_path *p,
1710 : struct extent_buffer *b, int level, int ins_len,
1711 : int *write_lock_level)
1712 : {
1713 563274850 : struct btrfs_fs_info *fs_info = root->fs_info;
1714 563274850 : int ret = 0;
1715 :
1716 563274850 : if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1717 151398260 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1718 :
1719 184675 : if (*write_lock_level < level + 1) {
1720 70042 : *write_lock_level = level + 1;
1721 70042 : btrfs_release_path(p);
1722 70042 : return -EAGAIN;
1723 : }
1724 :
1725 114633 : reada_for_balance(p, level);
1726 114633 : ret = split_node(trans, root, p, level);
1727 :
1728 114633 : b = p->nodes[level];
1729 563090175 : } else if (ins_len < 0 && btrfs_header_nritems(b) <
1730 107305342 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1731 :
1732 83056475 : if (*write_lock_level < level + 1) {
1733 29203398 : *write_lock_level = level + 1;
1734 29203398 : btrfs_release_path(p);
1735 29203398 : return -EAGAIN;
1736 : }
1737 :
1738 53853077 : reada_for_balance(p, level);
1739 53852823 : ret = balance_level(trans, root, p, level);
1740 53853708 : if (ret)
1741 : return ret;
1742 :
1743 53853708 : b = p->nodes[level];
1744 53853708 : if (!b) {
1745 1158 : btrfs_release_path(p);
1746 1158 : return -EAGAIN;
1747 : }
1748 53852550 : BUG_ON(btrfs_header_nritems(b) == 1);
1749 : }
1750 : return ret;
1751 : }
1752 :
1753 37450 : int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1754 : u64 iobjectid, u64 ioff, u8 key_type,
1755 : struct btrfs_key *found_key)
1756 : {
1757 37450 : int ret;
1758 37450 : struct btrfs_key key;
1759 37450 : struct extent_buffer *eb;
1760 :
1761 37450 : ASSERT(path);
1762 37450 : ASSERT(found_key);
1763 :
1764 37450 : key.type = key_type;
1765 37450 : key.objectid = iobjectid;
1766 37450 : key.offset = ioff;
1767 :
1768 37450 : ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1769 37450 : if (ret < 0)
1770 : return ret;
1771 :
1772 37450 : eb = path->nodes[0];
1773 37450 : if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1774 792 : ret = btrfs_next_leaf(fs_root, path);
1775 792 : if (ret)
1776 : return ret;
1777 792 : eb = path->nodes[0];
1778 : }
1779 :
1780 37450 : btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1781 37450 : if (found_key->type != key.type ||
1782 35376 : found_key->objectid != key.objectid)
1783 2074 : return 1;
1784 :
1785 : return 0;
1786 : }
1787 :
1788 393141564 : static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1789 : struct btrfs_path *p,
1790 : int write_lock_level)
1791 : {
1792 393141564 : struct extent_buffer *b;
1793 393141564 : int root_lock = 0;
1794 393141564 : int level = 0;
1795 :
1796 393141564 : if (p->search_commit_root) {
1797 33860721 : b = root->commit_root;
1798 33860721 : atomic_inc(&b->refs);
1799 33865465 : level = btrfs_header_level(b);
1800 : /*
1801 : * Ensure that all callers have set skip_locking when
1802 : * p->search_commit_root = 1.
1803 : */
1804 33865465 : ASSERT(p->skip_locking == 1);
1805 :
1806 33865465 : goto out;
1807 : }
1808 :
1809 359280843 : if (p->skip_locking) {
1810 7621630 : b = btrfs_root_node(root);
1811 7621630 : level = btrfs_header_level(b);
1812 7621630 : goto out;
1813 : }
1814 :
1815 : /* We try very hard to do read locks on the root */
1816 351659213 : root_lock = BTRFS_READ_LOCK;
1817 :
1818 : /*
1819 : * If the level is set to maximum, we can skip trying to get the read
1820 : * lock.
1821 : */
1822 351659213 : if (write_lock_level < BTRFS_MAX_LEVEL) {
1823 : /*
1824 : * We don't know the level of the root node until we actually
1825 : * have it read locked
1826 : */
1827 320305208 : if (p->nowait) {
1828 0 : b = btrfs_try_read_lock_root_node(root);
1829 0 : if (IS_ERR(b))
1830 : return b;
1831 : } else {
1832 320305208 : b = btrfs_read_lock_root_node(root);
1833 : }
1834 320290111 : level = btrfs_header_level(b);
1835 320290111 : if (level > write_lock_level)
1836 191732695 : goto out;
1837 :
1838 : /* Whoops, must trade for write lock */
1839 128557416 : btrfs_tree_read_unlock(b);
1840 128559319 : free_extent_buffer(b);
1841 : }
1842 :
1843 159924846 : b = btrfs_lock_root_node(root);
1844 159925620 : root_lock = BTRFS_WRITE_LOCK;
1845 :
1846 : /* The level might have changed, check again */
1847 159925620 : level = btrfs_header_level(b);
1848 :
1849 393145410 : out:
1850 : /*
1851 : * The root may have failed to write out at some point, and thus is no
1852 : * longer valid, return an error in this case.
1853 : */
1854 786290820 : if (!extent_buffer_uptodate(b)) {
1855 16 : if (root_lock)
1856 16 : btrfs_tree_unlock_rw(b, root_lock);
1857 16 : free_extent_buffer(b);
1858 16 : return ERR_PTR(-EIO);
1859 : }
1860 :
1861 393145394 : p->nodes[level] = b;
1862 393145394 : if (!p->skip_locking)
1863 351603457 : p->locks[level] = root_lock;
1864 : /*
1865 : * Callers are responsible for dropping b's references.
1866 : */
1867 : return b;
1868 : }
1869 :
1870 : /*
1871 : * Replace the extent buffer at the lowest level of the path with a cloned
1872 : * version. The purpose is to be able to use it safely, after releasing the
1873 : * commit root semaphore, even if relocation is happening in parallel, the
1874 : * transaction used for relocation is committed and the extent buffer is
1875 : * reallocated in the next transaction.
1876 : *
1877 : * This is used in a context where the caller does not prevent transaction
1878 : * commits from happening, either by holding a transaction handle or holding
1879 : * some lock, while it's doing searches through a commit root.
1880 : * At the moment it's only used for send operations.
1881 : */
1882 18787323 : static int finish_need_commit_sem_search(struct btrfs_path *path)
1883 : {
1884 18787323 : const int i = path->lowest_level;
1885 18787323 : const int slot = path->slots[i];
1886 18787323 : struct extent_buffer *lowest = path->nodes[i];
1887 18787323 : struct extent_buffer *clone;
1888 :
1889 18787323 : ASSERT(path->need_commit_sem);
1890 :
1891 18787323 : if (!lowest)
1892 : return 0;
1893 :
1894 18787323 : lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1895 :
1896 18787323 : clone = btrfs_clone_extent_buffer(lowest);
1897 18788780 : if (!clone)
1898 : return -ENOMEM;
1899 :
1900 18788780 : btrfs_release_path(path);
1901 18788010 : path->nodes[i] = clone;
1902 18788010 : path->slots[i] = slot;
1903 :
1904 18788010 : return 0;
1905 : }
1906 :
1907 : static inline int search_for_key_slot(struct extent_buffer *eb,
1908 : int search_low_slot,
1909 : const struct btrfs_key *key,
1910 : int prev_cmp,
1911 : int *slot)
1912 : {
1913 : /*
1914 : * If a previous call to btrfs_bin_search() on a parent node returned an
1915 : * exact match (prev_cmp == 0), we can safely assume the target key will
1916 : * always be at slot 0 on lower levels, since each key pointer
1917 : * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1918 : * subtree it points to. Thus we can skip searching lower levels.
1919 : */
1920 905233927 : if (prev_cmp == 0) {
1921 21603520 : *slot = 0;
1922 21603520 : return 0;
1923 : }
1924 :
1925 883630407 : return btrfs_bin_search(eb, search_low_slot, key, slot);
1926 : }
1927 :
1928 349138199 : static int search_leaf(struct btrfs_trans_handle *trans,
1929 : struct btrfs_root *root,
1930 : const struct btrfs_key *key,
1931 : struct btrfs_path *path,
1932 : int ins_len,
1933 : int prev_cmp)
1934 : {
1935 349138199 : struct extent_buffer *leaf = path->nodes[0];
1936 349138199 : int leaf_free_space = -1;
1937 349138199 : int search_low_slot = 0;
1938 349138199 : int ret;
1939 349138199 : bool do_bin_search = true;
1940 :
1941 : /*
1942 : * If we are doing an insertion, the leaf has enough free space and the
1943 : * destination slot for the key is not slot 0, then we can unlock our
1944 : * write lock on the parent, and any other upper nodes, before doing the
1945 : * binary search on the leaf (with search_for_key_slot()), allowing other
1946 : * tasks to lock the parent and any other upper nodes.
1947 : */
1948 349138199 : if (ins_len > 0) {
1949 : /*
1950 : * Cache the leaf free space, since we will need it later and it
1951 : * will not change until then.
1952 : */
1953 89445821 : leaf_free_space = btrfs_leaf_free_space(leaf);
1954 :
1955 : /*
1956 : * !path->locks[1] means we have a single node tree, the leaf is
1957 : * the root of the tree.
1958 : */
1959 89434495 : if (path->locks[1] && leaf_free_space >= ins_len) {
1960 78099137 : struct btrfs_disk_key first_key;
1961 :
1962 78099137 : ASSERT(btrfs_header_nritems(leaf) > 0);
1963 78099137 : btrfs_item_key(leaf, &first_key, 0);
1964 :
1965 : /*
1966 : * Doing the extra comparison with the first key is cheap,
1967 : * taking into account that the first key is very likely
1968 : * already in a cache line because it immediately follows
1969 : * the extent buffer's header and we have recently accessed
1970 : * the header's level field.
1971 : */
1972 78102516 : ret = comp_keys(&first_key, key);
1973 78102516 : if (ret < 0) {
1974 : /*
1975 : * The first key is smaller than the key we want
1976 : * to insert, so we are safe to unlock all upper
1977 : * nodes and we have to do the binary search.
1978 : *
1979 : * We do use btrfs_unlock_up_safe() and not
1980 : * unlock_up() because the later does not unlock
1981 : * nodes with a slot of 0 - we can safely unlock
1982 : * any node even if its slot is 0 since in this
1983 : * case the key does not end up at slot 0 of the
1984 : * leaf and there's no need to split the leaf.
1985 : */
1986 70900462 : btrfs_unlock_up_safe(path, 1);
1987 70900462 : search_low_slot = 1;
1988 : } else {
1989 : /*
1990 : * The first key is >= then the key we want to
1991 : * insert, so we can skip the binary search as
1992 : * the target key will be at slot 0.
1993 : *
1994 : * We can not unlock upper nodes when the key is
1995 : * less than the first key, because we will need
1996 : * to update the key at slot 0 of the parent node
1997 : * and possibly of other upper nodes too.
1998 : * If the key matches the first key, then we can
1999 : * unlock all the upper nodes, using
2000 : * btrfs_unlock_up_safe() instead of unlock_up()
2001 : * as stated above.
2002 : */
2003 7202054 : if (ret == 0)
2004 7098426 : btrfs_unlock_up_safe(path, 1);
2005 : /*
2006 : * ret is already 0 or 1, matching the result of
2007 : * a btrfs_bin_search() call, so there is no need
2008 : * to adjust it.
2009 : */
2010 7202054 : do_bin_search = false;
2011 7202054 : path->slots[0] = 0;
2012 : }
2013 : }
2014 : }
2015 :
2016 349132745 : if (do_bin_search) {
2017 341812952 : ret = search_for_key_slot(leaf, search_low_slot, key,
2018 : prev_cmp, &path->slots[0]);
2019 341837591 : if (ret < 0)
2020 : return ret;
2021 : }
2022 :
2023 349157384 : if (ins_len > 0) {
2024 : /*
2025 : * Item key already exists. In this case, if we are allowed to
2026 : * insert the item (for example, in dir_item case, item key
2027 : * collision is allowed), it will be merged with the original
2028 : * item. Only the item size grows, no new btrfs item will be
2029 : * added. If search_for_extension is not set, ins_len already
2030 : * accounts the size btrfs_item, deduct it here so leaf space
2031 : * check will be correct.
2032 : */
2033 89431116 : if (ret == 0 && !path->search_for_extension) {
2034 423608 : ASSERT(ins_len >= sizeof(struct btrfs_item));
2035 423608 : ins_len -= sizeof(struct btrfs_item);
2036 : }
2037 :
2038 89431116 : ASSERT(leaf_free_space >= 0);
2039 :
2040 89431116 : if (leaf_free_space < ins_len) {
2041 3553402 : int err;
2042 :
2043 3553402 : err = split_leaf(trans, root, key, path, ins_len,
2044 : (ret == 0));
2045 3553478 : ASSERT(err <= 0);
2046 3553478 : if (WARN_ON(err > 0))
2047 : err = -EUCLEAN;
2048 3553478 : if (err)
2049 : ret = err;
2050 : }
2051 : }
2052 :
2053 : return ret;
2054 : }
2055 :
2056 : /*
2057 : * btrfs_search_slot - look for a key in a tree and perform necessary
2058 : * modifications to preserve tree invariants.
2059 : *
2060 : * @trans: Handle of transaction, used when modifying the tree
2061 : * @p: Holds all btree nodes along the search path
2062 : * @root: The root node of the tree
2063 : * @key: The key we are looking for
2064 : * @ins_len: Indicates purpose of search:
2065 : * >0 for inserts it's size of item inserted (*)
2066 : * <0 for deletions
2067 : * 0 for plain searches, not modifying the tree
2068 : *
2069 : * (*) If size of item inserted doesn't include
2070 : * sizeof(struct btrfs_item), then p->search_for_extension must
2071 : * be set.
2072 : * @cow: boolean should CoW operations be performed. Must always be 1
2073 : * when modifying the tree.
2074 : *
2075 : * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2076 : * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2077 : *
2078 : * If @key is found, 0 is returned and you can find the item in the leaf level
2079 : * of the path (level 0)
2080 : *
2081 : * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2082 : * points to the slot where it should be inserted
2083 : *
2084 : * If an error is encountered while searching the tree a negative error number
2085 : * is returned
2086 : */
2087 357661905 : int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2088 : const struct btrfs_key *key, struct btrfs_path *p,
2089 : int ins_len, int cow)
2090 : {
2091 357661905 : struct btrfs_fs_info *fs_info = root->fs_info;
2092 357661905 : struct extent_buffer *b;
2093 357661905 : int slot;
2094 357661905 : int ret;
2095 357661905 : int err;
2096 357661905 : int level;
2097 357661905 : int lowest_unlock = 1;
2098 : /* everything at write_lock_level or lower must be write locked */
2099 357661905 : int write_lock_level = 0;
2100 357661905 : u8 lowest_level = 0;
2101 357661905 : int min_write_lock_level;
2102 357661905 : int prev_cmp;
2103 :
2104 357661905 : might_sleep();
2105 :
2106 357628494 : lowest_level = p->lowest_level;
2107 357628494 : WARN_ON(lowest_level && ins_len > 0);
2108 357628494 : WARN_ON(p->nodes[0] != NULL);
2109 357628494 : BUG_ON(!cow && ins_len);
2110 :
2111 : /*
2112 : * For now only allow nowait for read only operations. There's no
2113 : * strict reason why we can't, we just only need it for reads so it's
2114 : * only implemented for reads.
2115 : */
2116 357628494 : ASSERT(!p->nowait || !cow);
2117 :
2118 357628494 : if (ins_len < 0) {
2119 59694417 : lowest_unlock = 2;
2120 :
2121 : /* when we are removing items, we might have to go up to level
2122 : * two as we update tree pointers Make sure we keep write
2123 : * for those levels as well
2124 : */
2125 59694417 : write_lock_level = 2;
2126 297934077 : } else if (ins_len > 0) {
2127 : /*
2128 : * for inserting items, make sure we have a write lock on
2129 : * level 1 so we can update keys
2130 : */
2131 89444315 : write_lock_level = 1;
2132 : }
2133 :
2134 357628494 : if (!cow)
2135 174150757 : write_lock_level = -1;
2136 :
2137 357628494 : if (cow && (p->keep_locks || p->lowest_level))
2138 31349115 : write_lock_level = BTRFS_MAX_LEVEL;
2139 :
2140 357628494 : min_write_lock_level = write_lock_level;
2141 :
2142 357628494 : if (p->need_commit_sem) {
2143 18714764 : ASSERT(p->search_commit_root);
2144 18714764 : if (p->nowait) {
2145 0 : if (!down_read_trylock(&fs_info->commit_root_sem))
2146 : return -EAGAIN;
2147 : } else {
2148 18714764 : down_read(&fs_info->commit_root_sem);
2149 : }
2150 : }
2151 :
2152 338913730 : again:
2153 393106914 : prev_cmp = -1;
2154 393106914 : b = btrfs_search_slot_get_root(root, p, write_lock_level);
2155 393036943 : if (IS_ERR(b)) {
2156 16 : ret = PTR_ERR(b);
2157 16 : goto done;
2158 : }
2159 :
2160 913797861 : while (b) {
2161 913797861 : int dec = 0;
2162 :
2163 913797861 : level = btrfs_header_level(b);
2164 :
2165 913797861 : if (cow) {
2166 473652286 : bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2167 :
2168 : /*
2169 : * if we don't really need to cow this block
2170 : * then we don't want to set the path blocking,
2171 : * so we test it here
2172 : */
2173 473652286 : if (!should_cow_block(trans, root, b))
2174 466022566 : goto cow_done;
2175 :
2176 : /*
2177 : * must have write locks on this node and the
2178 : * parent
2179 : */
2180 7542133 : if (level > write_lock_level ||
2181 7290266 : (level + 1 > write_lock_level &&
2182 1588238 : level + 1 < BTRFS_MAX_LEVEL &&
2183 1588238 : p->nodes[level + 1])) {
2184 1181289 : write_lock_level = level + 1;
2185 1181289 : btrfs_release_path(p);
2186 1181407 : goto again;
2187 : }
2188 :
2189 6360844 : if (last_level)
2190 0 : err = btrfs_cow_block(trans, root, b, NULL, 0,
2191 : &b,
2192 : BTRFS_NESTING_COW);
2193 : else
2194 6360844 : err = btrfs_cow_block(trans, root, b,
2195 : p->nodes[level + 1],
2196 : p->slots[level + 1], &b,
2197 : BTRFS_NESTING_COW);
2198 6360987 : if (err) {
2199 68 : ret = err;
2200 68 : goto done;
2201 : }
2202 : }
2203 446506494 : cow_done:
2204 912529060 : p->nodes[level] = b;
2205 :
2206 : /*
2207 : * we have a lock on b and as long as we aren't changing
2208 : * the tree, there is no way to for the items in b to change.
2209 : * It is safe to drop the lock on our parent before we
2210 : * go through the expensive btree search on b.
2211 : *
2212 : * If we're inserting or deleting (ins_len != 0), then we might
2213 : * be changing slot zero, which may require changing the parent.
2214 : * So, we can't drop the lock until after we know which slot
2215 : * we're operating on.
2216 : */
2217 912529060 : if (!ins_len && !p->keep_locks) {
2218 427084573 : int u = level + 1;
2219 :
2220 427084573 : if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2221 185302552 : btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2222 185299907 : p->locks[u] = 0;
2223 : }
2224 : }
2225 :
2226 912526415 : if (level == 0) {
2227 349105440 : if (ins_len > 0)
2228 : ASSERT(write_lock_level >= 1);
2229 :
2230 349105440 : ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2231 349009073 : if (!p->search_for_split)
2232 348972045 : unlock_up(p, level, lowest_unlock,
2233 : min_write_lock_level, NULL);
2234 348992113 : goto done;
2235 : }
2236 :
2237 563420975 : ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2238 563224759 : if (ret < 0)
2239 0 : goto done;
2240 563224759 : prev_cmp = ret;
2241 :
2242 563224759 : if (ret && slot > 0) {
2243 525734942 : dec = 1;
2244 525734942 : slot--;
2245 : }
2246 563224759 : p->slots[level] = slot;
2247 563224759 : err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2248 : &write_lock_level);
2249 563470455 : if (err == -EAGAIN)
2250 29274473 : goto again;
2251 534195982 : if (err) {
2252 0 : ret = err;
2253 0 : goto done;
2254 : }
2255 534195982 : b = p->nodes[level];
2256 534195982 : slot = p->slots[level];
2257 :
2258 : /*
2259 : * Slot 0 is special, if we change the key we have to update
2260 : * the parent pointer which means we must have a write lock on
2261 : * the parent
2262 : */
2263 534195982 : if (slot == 0 && ins_len && write_lock_level < level + 1) {
2264 5005294 : write_lock_level = level + 1;
2265 5005294 : btrfs_release_path(p);
2266 5005458 : goto again;
2267 : }
2268 :
2269 529190688 : unlock_up(p, level, lowest_unlock, min_write_lock_level,
2270 : &write_lock_level);
2271 :
2272 528972373 : if (level == lowest_level) {
2273 8536312 : if (dec)
2274 27032 : p->slots[level]++;
2275 8536312 : goto done;
2276 : }
2277 :
2278 520436061 : err = read_block_for_search(root, p, &b, level, slot, key);
2279 520739556 : if (err == -EAGAIN)
2280 13097 : goto again;
2281 520726459 : if (err) {
2282 0 : ret = err;
2283 0 : goto done;
2284 : }
2285 :
2286 520726459 : if (!p->skip_locking) {
2287 461013449 : level = btrfs_header_level(b);
2288 :
2289 461013449 : btrfs_maybe_reset_lockdep_class(root, b);
2290 :
2291 461013449 : if (level <= write_lock_level) {
2292 248883349 : btrfs_tree_lock(b);
2293 248918420 : p->locks[level] = BTRFS_WRITE_LOCK;
2294 : } else {
2295 212130100 : if (p->nowait) {
2296 0 : if (!btrfs_try_tree_read_lock(b)) {
2297 0 : free_extent_buffer(b);
2298 0 : ret = -EAGAIN;
2299 0 : goto done;
2300 : }
2301 : } else {
2302 212130100 : btrfs_tree_read_lock(b);
2303 : }
2304 212129504 : p->locks[level] = BTRFS_READ_LOCK;
2305 : }
2306 461047924 : p->nodes[level] = b;
2307 : }
2308 : }
2309 : ret = 1;
2310 357528509 : done:
2311 357528509 : if (ret < 0 && !p->skip_release_on_error)
2312 84 : btrfs_release_path(p);
2313 :
2314 357528509 : if (p->need_commit_sem) {
2315 18718410 : int ret2;
2316 :
2317 18718410 : ret2 = finish_need_commit_sem_search(p);
2318 18718777 : up_read(&fs_info->commit_root_sem);
2319 18719477 : if (ret2)
2320 0 : ret = ret2;
2321 : }
2322 :
2323 : return ret;
2324 : }
2325 : ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2326 :
2327 : /*
2328 : * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2329 : * current state of the tree together with the operations recorded in the tree
2330 : * modification log to search for the key in a previous version of this tree, as
2331 : * denoted by the time_seq parameter.
2332 : *
2333 : * Naturally, there is no support for insert, delete or cow operations.
2334 : *
2335 : * The resulting path and return value will be set up as if we called
2336 : * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2337 : */
2338 3826516 : int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2339 : struct btrfs_path *p, u64 time_seq)
2340 : {
2341 3826516 : struct btrfs_fs_info *fs_info = root->fs_info;
2342 3826516 : struct extent_buffer *b;
2343 3826516 : int slot;
2344 3826516 : int ret;
2345 3826516 : int err;
2346 3826516 : int level;
2347 3826516 : int lowest_unlock = 1;
2348 3826516 : u8 lowest_level = 0;
2349 :
2350 3826516 : lowest_level = p->lowest_level;
2351 3826516 : WARN_ON(p->nodes[0] != NULL);
2352 3826516 : ASSERT(!p->nowait);
2353 :
2354 3826516 : if (p->search_commit_root) {
2355 3347655 : BUG_ON(time_seq);
2356 3347655 : return btrfs_search_slot(NULL, root, key, p, 0, 0);
2357 : }
2358 :
2359 478861 : again:
2360 478861 : b = btrfs_get_old_root(root, time_seq);
2361 478861 : if (!b) {
2362 0 : ret = -EIO;
2363 0 : goto done;
2364 : }
2365 478861 : level = btrfs_header_level(b);
2366 478861 : p->locks[level] = BTRFS_READ_LOCK;
2367 :
2368 918366 : while (b) {
2369 918366 : int dec = 0;
2370 :
2371 918366 : level = btrfs_header_level(b);
2372 918366 : p->nodes[level] = b;
2373 :
2374 : /*
2375 : * we have a lock on b and as long as we aren't changing
2376 : * the tree, there is no way to for the items in b to change.
2377 : * It is safe to drop the lock on our parent before we
2378 : * go through the expensive btree search on b.
2379 : */
2380 918366 : btrfs_unlock_up_safe(p, level + 1);
2381 :
2382 918366 : ret = btrfs_bin_search(b, 0, key, &slot);
2383 918366 : if (ret < 0)
2384 0 : goto done;
2385 :
2386 918366 : if (level == 0) {
2387 309942 : p->slots[level] = slot;
2388 309942 : unlock_up(p, level, lowest_unlock, 0, NULL);
2389 309942 : goto done;
2390 : }
2391 :
2392 608424 : if (ret && slot > 0) {
2393 435400 : dec = 1;
2394 435400 : slot--;
2395 : }
2396 608424 : p->slots[level] = slot;
2397 608424 : unlock_up(p, level, lowest_unlock, 0, NULL);
2398 :
2399 608424 : if (level == lowest_level) {
2400 168919 : if (dec)
2401 0 : p->slots[level]++;
2402 168919 : goto done;
2403 : }
2404 :
2405 439505 : err = read_block_for_search(root, p, &b, level, slot, key);
2406 439505 : if (err == -EAGAIN)
2407 0 : goto again;
2408 439505 : if (err) {
2409 0 : ret = err;
2410 0 : goto done;
2411 : }
2412 :
2413 439505 : level = btrfs_header_level(b);
2414 439505 : btrfs_tree_read_lock(b);
2415 439505 : b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2416 439505 : if (!b) {
2417 0 : ret = -ENOMEM;
2418 0 : goto done;
2419 : }
2420 439505 : p->locks[level] = BTRFS_READ_LOCK;
2421 439505 : p->nodes[level] = b;
2422 : }
2423 : ret = 1;
2424 478861 : done:
2425 478861 : if (ret < 0)
2426 0 : btrfs_release_path(p);
2427 :
2428 : return ret;
2429 : }
2430 :
2431 : /*
2432 : * Search the tree again to find a leaf with smaller keys.
2433 : * Returns 0 if it found something.
2434 : * Returns 1 if there are no smaller keys.
2435 : * Returns < 0 on error.
2436 : *
2437 : * This may release the path, and so you may lose any locks held at the
2438 : * time you call it.
2439 : */
2440 1485 : static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2441 : {
2442 1485 : struct btrfs_key key;
2443 1485 : struct btrfs_key orig_key;
2444 1485 : struct btrfs_disk_key found_key;
2445 1485 : int ret;
2446 :
2447 1485 : btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2448 1485 : orig_key = key;
2449 :
2450 1485 : if (key.offset > 0) {
2451 1485 : key.offset--;
2452 0 : } else if (key.type > 0) {
2453 0 : key.type--;
2454 0 : key.offset = (u64)-1;
2455 0 : } else if (key.objectid > 0) {
2456 0 : key.objectid--;
2457 0 : key.type = (u8)-1;
2458 0 : key.offset = (u64)-1;
2459 : } else {
2460 : return 1;
2461 : }
2462 :
2463 1485 : btrfs_release_path(path);
2464 1485 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2465 1485 : if (ret <= 0)
2466 : return ret;
2467 :
2468 : /*
2469 : * Previous key not found. Even if we were at slot 0 of the leaf we had
2470 : * before releasing the path and calling btrfs_search_slot(), we now may
2471 : * be in a slot pointing to the same original key - this can happen if
2472 : * after we released the path, one of more items were moved from a
2473 : * sibling leaf into the front of the leaf we had due to an insertion
2474 : * (see push_leaf_right()).
2475 : * If we hit this case and our slot is > 0 and just decrement the slot
2476 : * so that the caller does not process the same key again, which may or
2477 : * may not break the caller, depending on its logic.
2478 : */
2479 1485 : if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2480 946 : btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2481 946 : ret = comp_keys(&found_key, &orig_key);
2482 946 : if (ret == 0) {
2483 946 : if (path->slots[0] > 0) {
2484 0 : path->slots[0]--;
2485 0 : return 0;
2486 : }
2487 : /*
2488 : * At slot 0, same key as before, it means orig_key is
2489 : * the lowest, leftmost, key in the tree. We're done.
2490 : */
2491 : return 1;
2492 : }
2493 : }
2494 :
2495 539 : btrfs_item_key(path->nodes[0], &found_key, 0);
2496 539 : ret = comp_keys(&found_key, &key);
2497 : /*
2498 : * We might have had an item with the previous key in the tree right
2499 : * before we released our path. And after we released our path, that
2500 : * item might have been pushed to the first slot (0) of the leaf we
2501 : * were holding due to a tree balance. Alternatively, an item with the
2502 : * previous key can exist as the only element of a leaf (big fat item).
2503 : * Therefore account for these 2 cases, so that our callers (like
2504 : * btrfs_previous_item) don't miss an existing item with a key matching
2505 : * the previous key we computed above.
2506 : */
2507 539 : if (ret <= 0)
2508 539 : return 0;
2509 : return 1;
2510 : }
2511 :
2512 : /*
2513 : * helper to use instead of search slot if no exact match is needed but
2514 : * instead the next or previous item should be returned.
2515 : * When find_higher is true, the next higher item is returned, the next lower
2516 : * otherwise.
2517 : * When return_any and find_higher are both true, and no higher item is found,
2518 : * return the next lower instead.
2519 : * When return_any is true and find_higher is false, and no lower item is found,
2520 : * return the next higher instead.
2521 : * It returns 0 if any item is found, 1 if none is found (tree empty), and
2522 : * < 0 on error
2523 : */
2524 1835455 : int btrfs_search_slot_for_read(struct btrfs_root *root,
2525 : const struct btrfs_key *key,
2526 : struct btrfs_path *p, int find_higher,
2527 : int return_any)
2528 : {
2529 1835455 : int ret;
2530 1835455 : struct extent_buffer *leaf;
2531 :
2532 : again:
2533 1835455 : ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2534 1835490 : if (ret <= 0)
2535 959 : return ret;
2536 : /*
2537 : * a return value of 1 means the path is at the position where the
2538 : * item should be inserted. Normally this is the next bigger item,
2539 : * but in case the previous item is the last in a leaf, path points
2540 : * to the first free slot in the previous leaf, i.e. at an invalid
2541 : * item.
2542 : */
2543 1834531 : leaf = p->nodes[0];
2544 :
2545 1834531 : if (find_higher) {
2546 1426424 : if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2547 61531 : ret = btrfs_next_leaf(root, p);
2548 61531 : if (ret <= 0)
2549 61454 : return ret;
2550 77 : if (!return_any)
2551 : return 1;
2552 : /*
2553 : * no higher item found, return the next
2554 : * lower instead
2555 : */
2556 0 : return_any = 0;
2557 0 : find_higher = 0;
2558 0 : btrfs_release_path(p);
2559 0 : goto again;
2560 : }
2561 : } else {
2562 408107 : if (p->slots[0] == 0) {
2563 0 : ret = btrfs_prev_leaf(root, p);
2564 0 : if (ret < 0)
2565 0 : return ret;
2566 0 : if (!ret) {
2567 0 : leaf = p->nodes[0];
2568 0 : if (p->slots[0] == btrfs_header_nritems(leaf))
2569 0 : p->slots[0]--;
2570 0 : return 0;
2571 : }
2572 0 : if (!return_any)
2573 : return 1;
2574 : /*
2575 : * no lower item found, return the next
2576 : * higher instead
2577 : */
2578 0 : return_any = 0;
2579 0 : find_higher = 1;
2580 0 : btrfs_release_path(p);
2581 0 : goto again;
2582 : } else {
2583 408107 : --p->slots[0];
2584 : }
2585 : }
2586 : return 0;
2587 : }
2588 :
2589 : /*
2590 : * Execute search and call btrfs_previous_item to traverse backwards if the item
2591 : * was not found.
2592 : *
2593 : * Return 0 if found, 1 if not found and < 0 if error.
2594 : */
2595 3586 : int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2596 : struct btrfs_path *path)
2597 : {
2598 3586 : int ret;
2599 :
2600 3586 : ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2601 3586 : if (ret > 0)
2602 3368 : ret = btrfs_previous_item(root, path, key->objectid, key->type);
2603 :
2604 3586 : if (ret == 0)
2605 2321 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2606 :
2607 3586 : return ret;
2608 : }
2609 :
2610 : /*
2611 : * Search for a valid slot for the given path.
2612 : *
2613 : * @root: The root node of the tree.
2614 : * @key: Will contain a valid item if found.
2615 : * @path: The starting point to validate the slot.
2616 : *
2617 : * Return: 0 if the item is valid
2618 : * 1 if not found
2619 : * <0 if error.
2620 : */
2621 2670779740 : int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2622 : struct btrfs_path *path)
2623 : {
2624 2670779740 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2625 11274359 : int ret;
2626 :
2627 11274359 : ret = btrfs_next_leaf(root, path);
2628 11274210 : if (ret)
2629 : return ret;
2630 : }
2631 :
2632 2670763904 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2633 2670763904 : return 0;
2634 : }
2635 :
2636 : /*
2637 : * adjust the pointers going up the tree, starting at level
2638 : * making sure the right key of each node is points to 'key'.
2639 : * This is used after shifting pointers to the left, so it stops
2640 : * fixing up pointers when a given leaf/node is not in slot 0 of the
2641 : * higher levels
2642 : *
2643 : */
2644 5117695 : static void fixup_low_keys(struct btrfs_path *path,
2645 : struct btrfs_disk_key *key, int level)
2646 : {
2647 5117695 : int i;
2648 5117695 : struct extent_buffer *t;
2649 5117695 : int ret;
2650 :
2651 5451755 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2652 5451755 : int tslot = path->slots[i];
2653 :
2654 5451755 : if (!path->nodes[i])
2655 : break;
2656 4989371 : t = path->nodes[i];
2657 4989371 : ret = btrfs_tree_mod_log_insert_key(t, tslot,
2658 : BTRFS_MOD_LOG_KEY_REPLACE);
2659 4989371 : BUG_ON(ret < 0);
2660 4989371 : btrfs_set_node_key(t, key, tslot);
2661 4989374 : btrfs_mark_buffer_dirty(path->nodes[i]);
2662 4989415 : if (tslot != 0)
2663 : break;
2664 : }
2665 5117739 : }
2666 :
2667 : /*
2668 : * update item key.
2669 : *
2670 : * This function isn't completely safe. It's the caller's responsibility
2671 : * that the new key won't break the order
2672 : */
2673 5276016 : void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2674 : struct btrfs_path *path,
2675 : const struct btrfs_key *new_key)
2676 : {
2677 5276016 : struct btrfs_disk_key disk_key;
2678 5276016 : struct extent_buffer *eb;
2679 5276016 : int slot;
2680 :
2681 5276016 : eb = path->nodes[0];
2682 5276016 : slot = path->slots[0];
2683 5276016 : if (slot > 0) {
2684 4629352 : btrfs_item_key(eb, &disk_key, slot - 1);
2685 4629348 : if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2686 0 : btrfs_print_leaf(eb);
2687 0 : btrfs_crit(fs_info,
2688 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2689 : slot, btrfs_disk_key_objectid(&disk_key),
2690 : btrfs_disk_key_type(&disk_key),
2691 : btrfs_disk_key_offset(&disk_key),
2692 : new_key->objectid, new_key->type,
2693 : new_key->offset);
2694 0 : BUG();
2695 : }
2696 : }
2697 5276012 : if (slot < btrfs_header_nritems(eb) - 1) {
2698 3843159 : btrfs_item_key(eb, &disk_key, slot + 1);
2699 3843158 : if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2700 0 : btrfs_print_leaf(eb);
2701 0 : btrfs_crit(fs_info,
2702 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2703 : slot, btrfs_disk_key_objectid(&disk_key),
2704 : btrfs_disk_key_type(&disk_key),
2705 : btrfs_disk_key_offset(&disk_key),
2706 : new_key->objectid, new_key->type,
2707 : new_key->offset);
2708 0 : BUG();
2709 : }
2710 : }
2711 :
2712 5276011 : btrfs_cpu_key_to_disk(&disk_key, new_key);
2713 5276007 : btrfs_set_item_key(eb, &disk_key, slot);
2714 5276008 : btrfs_mark_buffer_dirty(eb);
2715 5276009 : if (slot == 0)
2716 646663 : fixup_low_keys(path, &disk_key, 1);
2717 5276009 : }
2718 :
2719 : /*
2720 : * Check key order of two sibling extent buffers.
2721 : *
2722 : * Return true if something is wrong.
2723 : * Return false if everything is fine.
2724 : *
2725 : * Tree-checker only works inside one tree block, thus the following
2726 : * corruption can not be detected by tree-checker:
2727 : *
2728 : * Leaf @left | Leaf @right
2729 : * --------------------------------------------------------------
2730 : * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2731 : *
2732 : * Key f6 in leaf @left itself is valid, but not valid when the next
2733 : * key in leaf @right is 7.
2734 : * This can only be checked at tree block merge time.
2735 : * And since tree checker has ensured all key order in each tree block
2736 : * is correct, we only need to bother the last key of @left and the first
2737 : * key of @right.
2738 : */
2739 4074808 : static bool check_sibling_keys(struct extent_buffer *left,
2740 : struct extent_buffer *right)
2741 : {
2742 4074808 : struct btrfs_key left_last;
2743 4074808 : struct btrfs_key right_first;
2744 4074808 : int level = btrfs_header_level(left);
2745 4074808 : int nr_left = btrfs_header_nritems(left);
2746 4074808 : int nr_right = btrfs_header_nritems(right);
2747 :
2748 : /* No key to check in one of the tree blocks */
2749 4074808 : if (!nr_left || !nr_right)
2750 : return false;
2751 :
2752 4074735 : if (level) {
2753 114520 : btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2754 114520 : btrfs_node_key_to_cpu(right, &right_first, 0);
2755 : } else {
2756 3960215 : btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2757 3960207 : btrfs_item_key_to_cpu(right, &right_first, 0);
2758 : }
2759 :
2760 4074722 : if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2761 0 : btrfs_crit(left->fs_info, "left extent buffer:");
2762 0 : btrfs_print_tree(left, false);
2763 0 : btrfs_crit(left->fs_info, "right extent buffer:");
2764 0 : btrfs_print_tree(right, false);
2765 0 : btrfs_crit(left->fs_info,
2766 : "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2767 : left_last.objectid, left_last.type,
2768 : left_last.offset, right_first.objectid,
2769 : right_first.type, right_first.offset);
2770 0 : return true;
2771 : }
2772 : return false;
2773 : }
2774 :
2775 : /*
2776 : * try to push data from one node into the next node left in the
2777 : * tree.
2778 : *
2779 : * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2780 : * error, and > 0 if there was no room in the left hand block.
2781 : */
2782 73520 : static int push_node_left(struct btrfs_trans_handle *trans,
2783 : struct extent_buffer *dst,
2784 : struct extent_buffer *src, int empty)
2785 : {
2786 73520 : struct btrfs_fs_info *fs_info = trans->fs_info;
2787 73520 : int push_items = 0;
2788 73520 : int src_nritems;
2789 73520 : int dst_nritems;
2790 73520 : int ret = 0;
2791 :
2792 73520 : src_nritems = btrfs_header_nritems(src);
2793 73520 : dst_nritems = btrfs_header_nritems(dst);
2794 73520 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2795 73520 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2796 73520 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2797 :
2798 73520 : if (!empty && src_nritems <= 8)
2799 : return 1;
2800 :
2801 73520 : if (push_items <= 0)
2802 : return 1;
2803 :
2804 58248 : if (empty) {
2805 586 : push_items = min(src_nritems, push_items);
2806 586 : if (push_items < src_nritems) {
2807 : /* leave at least 8 pointers in the node if
2808 : * we aren't going to empty it
2809 : */
2810 399 : if (src_nritems - push_items < 8) {
2811 234 : if (push_items <= 8)
2812 : return 1;
2813 9 : push_items -= 8;
2814 : }
2815 : }
2816 : } else
2817 57662 : push_items = min(src_nritems - 8, push_items);
2818 :
2819 : /* dst is the left eb, src is the middle eb */
2820 58023 : if (check_sibling_keys(dst, src)) {
2821 0 : ret = -EUCLEAN;
2822 0 : btrfs_abort_transaction(trans, ret);
2823 0 : return ret;
2824 : }
2825 58023 : ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2826 58023 : if (ret) {
2827 0 : btrfs_abort_transaction(trans, ret);
2828 0 : return ret;
2829 : }
2830 58023 : copy_extent_buffer(dst, src,
2831 : btrfs_node_key_ptr_offset(dst, dst_nritems),
2832 : btrfs_node_key_ptr_offset(src, 0),
2833 : push_items * sizeof(struct btrfs_key_ptr));
2834 :
2835 58023 : if (push_items < src_nritems) {
2836 : /*
2837 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2838 : * don't need to do an explicit tree mod log operation for it.
2839 : */
2840 57836 : memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2841 : btrfs_node_key_ptr_offset(src, push_items),
2842 57836 : (src_nritems - push_items) *
2843 : sizeof(struct btrfs_key_ptr));
2844 : }
2845 58023 : btrfs_set_header_nritems(src, src_nritems - push_items);
2846 58023 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2847 58023 : btrfs_mark_buffer_dirty(src);
2848 58023 : btrfs_mark_buffer_dirty(dst);
2849 :
2850 58023 : return ret;
2851 : }
2852 :
2853 : /*
2854 : * try to push data from one node into the next node right in the
2855 : * tree.
2856 : *
2857 : * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2858 : * error, and > 0 if there was no room in the right hand block.
2859 : *
2860 : * this will only push up to 1/2 the contents of the left node over
2861 : */
2862 56570 : static int balance_node_right(struct btrfs_trans_handle *trans,
2863 : struct extent_buffer *dst,
2864 : struct extent_buffer *src)
2865 : {
2866 56570 : struct btrfs_fs_info *fs_info = trans->fs_info;
2867 56570 : int push_items = 0;
2868 56570 : int max_push;
2869 56570 : int src_nritems;
2870 56570 : int dst_nritems;
2871 56570 : int ret = 0;
2872 :
2873 56570 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2874 56570 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2875 :
2876 56570 : src_nritems = btrfs_header_nritems(src);
2877 56570 : dst_nritems = btrfs_header_nritems(dst);
2878 56570 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2879 56570 : if (push_items <= 0)
2880 : return 1;
2881 :
2882 56570 : if (src_nritems < 4)
2883 : return 1;
2884 :
2885 56570 : max_push = src_nritems / 2 + 1;
2886 : /* don't try to empty the node */
2887 56570 : if (max_push >= src_nritems)
2888 : return 1;
2889 :
2890 56570 : if (max_push < push_items)
2891 : push_items = max_push;
2892 :
2893 : /* dst is the right eb, src is the middle eb */
2894 56570 : if (check_sibling_keys(src, dst)) {
2895 0 : ret = -EUCLEAN;
2896 0 : btrfs_abort_transaction(trans, ret);
2897 0 : return ret;
2898 : }
2899 :
2900 : /*
2901 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2902 : * need to do an explicit tree mod log operation for it.
2903 : */
2904 56570 : memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2905 : btrfs_node_key_ptr_offset(dst, 0),
2906 : (dst_nritems) *
2907 : sizeof(struct btrfs_key_ptr));
2908 :
2909 56570 : ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2910 : push_items);
2911 56570 : if (ret) {
2912 0 : btrfs_abort_transaction(trans, ret);
2913 0 : return ret;
2914 : }
2915 56570 : copy_extent_buffer(dst, src,
2916 : btrfs_node_key_ptr_offset(dst, 0),
2917 : btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2918 : push_items * sizeof(struct btrfs_key_ptr));
2919 :
2920 56570 : btrfs_set_header_nritems(src, src_nritems - push_items);
2921 56570 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2922 :
2923 56570 : btrfs_mark_buffer_dirty(src);
2924 56570 : btrfs_mark_buffer_dirty(dst);
2925 :
2926 56570 : return ret;
2927 : }
2928 :
2929 : /*
2930 : * helper function to insert a new root level in the tree.
2931 : * A new node is allocated, and a single item is inserted to
2932 : * point to the existing root
2933 : *
2934 : * returns zero on success or < 0 on failure.
2935 : */
2936 1961 : static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2937 : struct btrfs_root *root,
2938 : struct btrfs_path *path, int level)
2939 : {
2940 1961 : struct btrfs_fs_info *fs_info = root->fs_info;
2941 1961 : u64 lower_gen;
2942 1961 : struct extent_buffer *lower;
2943 1961 : struct extent_buffer *c;
2944 1961 : struct extent_buffer *old;
2945 1961 : struct btrfs_disk_key lower_key;
2946 1961 : int ret;
2947 :
2948 1961 : BUG_ON(path->nodes[level]);
2949 1961 : BUG_ON(path->nodes[level-1] != root->node);
2950 :
2951 1961 : lower = path->nodes[level-1];
2952 1961 : if (level == 1)
2953 1882 : btrfs_item_key(lower, &lower_key, 0);
2954 : else
2955 79 : btrfs_node_key(lower, &lower_key, 0);
2956 :
2957 1961 : c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2958 1961 : &lower_key, level, root->node->start, 0,
2959 : BTRFS_NESTING_NEW_ROOT);
2960 1961 : if (IS_ERR(c))
2961 0 : return PTR_ERR(c);
2962 :
2963 1961 : root_add_used(root, fs_info->nodesize);
2964 :
2965 1961 : btrfs_set_header_nritems(c, 1);
2966 1961 : btrfs_set_node_key(c, &lower_key, 0);
2967 1961 : btrfs_set_node_blockptr(c, 0, lower->start);
2968 1961 : lower_gen = btrfs_header_generation(lower);
2969 1961 : WARN_ON(lower_gen != trans->transid);
2970 :
2971 1961 : btrfs_set_node_ptr_generation(c, 0, lower_gen);
2972 :
2973 1961 : btrfs_mark_buffer_dirty(c);
2974 :
2975 1961 : old = root->node;
2976 1961 : ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2977 1961 : if (ret < 0) {
2978 0 : btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2979 0 : btrfs_tree_unlock(c);
2980 0 : free_extent_buffer(c);
2981 0 : return ret;
2982 : }
2983 1961 : rcu_assign_pointer(root->node, c);
2984 :
2985 : /* the super has an extra ref to root->node */
2986 1961 : free_extent_buffer(old);
2987 :
2988 1961 : add_root_to_dirty_list(root);
2989 1961 : atomic_inc(&c->refs);
2990 1961 : path->nodes[level] = c;
2991 1961 : path->locks[level] = BTRFS_WRITE_LOCK;
2992 1961 : path->slots[level] = 0;
2993 1961 : return 0;
2994 : }
2995 :
2996 : /*
2997 : * worker function to insert a single pointer in a node.
2998 : * the node should have enough room for the pointer already
2999 : *
3000 : * slot and level indicate where you want the key to go, and
3001 : * blocknr is the block the key points to.
3002 : */
3003 363311 : static int insert_ptr(struct btrfs_trans_handle *trans,
3004 : struct btrfs_path *path,
3005 : struct btrfs_disk_key *key, u64 bytenr,
3006 : int slot, int level)
3007 : {
3008 363311 : struct extent_buffer *lower;
3009 363311 : int nritems;
3010 363311 : int ret;
3011 :
3012 363311 : BUG_ON(!path->nodes[level]);
3013 363311 : btrfs_assert_tree_write_locked(path->nodes[level]);
3014 363311 : lower = path->nodes[level];
3015 363311 : nritems = btrfs_header_nritems(lower);
3016 363311 : BUG_ON(slot > nritems);
3017 363311 : BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3018 363311 : if (slot != nritems) {
3019 186527 : if (level) {
3020 186524 : ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
3021 : slot, nritems - slot);
3022 186514 : if (ret < 0) {
3023 0 : btrfs_abort_transaction(trans, ret);
3024 0 : return ret;
3025 : }
3026 : }
3027 186517 : memmove_extent_buffer(lower,
3028 : btrfs_node_key_ptr_offset(lower, slot + 1),
3029 : btrfs_node_key_ptr_offset(lower, slot),
3030 186517 : (nritems - slot) * sizeof(struct btrfs_key_ptr));
3031 : }
3032 363314 : if (level) {
3033 363314 : ret = btrfs_tree_mod_log_insert_key(lower, slot,
3034 : BTRFS_MOD_LOG_KEY_ADD);
3035 363311 : if (ret < 0) {
3036 0 : btrfs_abort_transaction(trans, ret);
3037 0 : return ret;
3038 : }
3039 : }
3040 363311 : btrfs_set_node_key(lower, key, slot);
3041 363311 : btrfs_set_node_blockptr(lower, slot, bytenr);
3042 363307 : WARN_ON(trans->transid == 0);
3043 363307 : btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3044 363308 : btrfs_set_header_nritems(lower, nritems + 1);
3045 363308 : btrfs_mark_buffer_dirty(lower);
3046 :
3047 363308 : return 0;
3048 : }
3049 :
3050 : /*
3051 : * split the node at the specified level in path in two.
3052 : * The path is corrected to point to the appropriate node after the split
3053 : *
3054 : * Before splitting this tries to make some room in the node by pushing
3055 : * left and right, if either one works, it returns right away.
3056 : *
3057 : * returns 0 on success and < 0 on failure
3058 : */
3059 114633 : static noinline int split_node(struct btrfs_trans_handle *trans,
3060 : struct btrfs_root *root,
3061 : struct btrfs_path *path, int level)
3062 : {
3063 114633 : struct btrfs_fs_info *fs_info = root->fs_info;
3064 114633 : struct extent_buffer *c;
3065 114633 : struct extent_buffer *split;
3066 114633 : struct btrfs_disk_key disk_key;
3067 114633 : int mid;
3068 114633 : int ret;
3069 114633 : u32 c_nritems;
3070 :
3071 114633 : c = path->nodes[level];
3072 114633 : WARN_ON(btrfs_header_generation(c) != trans->transid);
3073 114633 : if (c == root->node) {
3074 : /*
3075 : * trying to split the root, lets make a new one
3076 : *
3077 : * tree mod log: We don't log_removal old root in
3078 : * insert_new_root, because that root buffer will be kept as a
3079 : * normal node. We are going to log removal of half of the
3080 : * elements below with btrfs_tree_mod_log_eb_copy(). We're
3081 : * holding a tree lock on the buffer, which is why we cannot
3082 : * race with other tree_mod_log users.
3083 : */
3084 79 : ret = insert_new_root(trans, root, path, level + 1);
3085 79 : if (ret)
3086 : return ret;
3087 : } else {
3088 114554 : ret = push_nodes_for_insert(trans, root, path, level);
3089 114554 : c = path->nodes[level];
3090 114554 : if (!ret && btrfs_header_nritems(c) <
3091 114232 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3092 : return 0;
3093 684 : if (ret < 0)
3094 : return ret;
3095 : }
3096 :
3097 763 : c_nritems = btrfs_header_nritems(c);
3098 763 : mid = (c_nritems + 1) / 2;
3099 763 : btrfs_node_key(c, &disk_key, mid);
3100 :
3101 763 : split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3102 : &disk_key, level, c->start, 0,
3103 : BTRFS_NESTING_SPLIT);
3104 763 : if (IS_ERR(split))
3105 0 : return PTR_ERR(split);
3106 :
3107 763 : root_add_used(root, fs_info->nodesize);
3108 763 : ASSERT(btrfs_header_level(c) == level);
3109 :
3110 763 : ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3111 763 : if (ret) {
3112 0 : btrfs_tree_unlock(split);
3113 0 : free_extent_buffer(split);
3114 0 : btrfs_abort_transaction(trans, ret);
3115 0 : return ret;
3116 : }
3117 763 : copy_extent_buffer(split, c,
3118 : btrfs_node_key_ptr_offset(split, 0),
3119 : btrfs_node_key_ptr_offset(c, mid),
3120 763 : (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3121 763 : btrfs_set_header_nritems(split, c_nritems - mid);
3122 763 : btrfs_set_header_nritems(c, mid);
3123 :
3124 763 : btrfs_mark_buffer_dirty(c);
3125 763 : btrfs_mark_buffer_dirty(split);
3126 :
3127 763 : ret = insert_ptr(trans, path, &disk_key, split->start,
3128 763 : path->slots[level + 1] + 1, level + 1);
3129 763 : if (ret < 0) {
3130 0 : btrfs_tree_unlock(split);
3131 0 : free_extent_buffer(split);
3132 0 : return ret;
3133 : }
3134 :
3135 763 : if (path->slots[level] >= mid) {
3136 646 : path->slots[level] -= mid;
3137 646 : btrfs_tree_unlock(c);
3138 646 : free_extent_buffer(c);
3139 646 : path->nodes[level] = split;
3140 646 : path->slots[level + 1] += 1;
3141 : } else {
3142 117 : btrfs_tree_unlock(split);
3143 117 : free_extent_buffer(split);
3144 : }
3145 : return 0;
3146 : }
3147 :
3148 : /*
3149 : * how many bytes are required to store the items in a leaf. start
3150 : * and nr indicate which items in the leaf to check. This totals up the
3151 : * space used both by the item structs and the item data
3152 : */
3153 338341691 : static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3154 : {
3155 338341691 : int data_len;
3156 338341691 : int nritems = btrfs_header_nritems(l);
3157 338341691 : int end = min(nritems, start + nr) - 1;
3158 :
3159 338341691 : if (!nr)
3160 : return 0;
3161 338305504 : data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3162 338145185 : data_len = data_len - btrfs_item_offset(l, end);
3163 338174952 : data_len += sizeof(struct btrfs_item) * nr;
3164 338174952 : WARN_ON(data_len < 0);
3165 : return data_len;
3166 : }
3167 :
3168 : /*
3169 : * The space between the end of the leaf items and
3170 : * the start of the leaf data. IOW, how much room
3171 : * the leaf has left for both items and data
3172 : */
3173 307572865 : int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3174 : {
3175 307572865 : struct btrfs_fs_info *fs_info = leaf->fs_info;
3176 307572865 : int nritems = btrfs_header_nritems(leaf);
3177 307572865 : int ret;
3178 :
3179 307572865 : ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3180 307455147 : if (ret < 0) {
3181 0 : btrfs_crit(fs_info,
3182 : "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3183 : ret,
3184 : (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3185 : leaf_space_used(leaf, 0, nritems), nritems);
3186 : }
3187 307455147 : return ret;
3188 : }
3189 :
3190 : /*
3191 : * min slot controls the lowest index we're willing to push to the
3192 : * right. We'll push up to and including min_slot, but no lower
3193 : */
3194 2717268 : static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3195 : struct btrfs_path *path,
3196 : int data_size, int empty,
3197 : struct extent_buffer *right,
3198 : int free_space, u32 left_nritems,
3199 : u32 min_slot)
3200 : {
3201 2717268 : struct btrfs_fs_info *fs_info = right->fs_info;
3202 2717268 : struct extent_buffer *left = path->nodes[0];
3203 2717268 : struct extent_buffer *upper = path->nodes[1];
3204 2717268 : struct btrfs_map_token token;
3205 2717268 : struct btrfs_disk_key disk_key;
3206 2717268 : int slot;
3207 2717268 : u32 i;
3208 2717268 : int push_space = 0;
3209 2717268 : int push_items = 0;
3210 2717268 : u32 nr;
3211 2717268 : u32 right_nritems;
3212 2717268 : u32 data_end;
3213 2717268 : u32 this_item_size;
3214 :
3215 2717268 : if (empty)
3216 : nr = 0;
3217 : else
3218 2697641 : nr = max_t(u32, 1, min_slot);
3219 :
3220 2717268 : if (path->slots[0] >= left_nritems)
3221 3162 : push_space += data_size;
3222 :
3223 2717268 : slot = path->slots[1];
3224 2717268 : i = left_nritems - 1;
3225 92847668 : while (i >= nr) {
3226 92846212 : if (!empty && push_items > 0) {
3227 89521931 : if (path->slots[0] > i)
3228 : break;
3229 89327181 : if (path->slots[0] == i) {
3230 339464 : int space = btrfs_leaf_free_space(left);
3231 :
3232 339463 : if (space + push_space * 2 > free_space)
3233 : break;
3234 : }
3235 : }
3236 :
3237 92490938 : if (path->slots[0] == i)
3238 241893 : push_space += data_size;
3239 :
3240 92490938 : this_item_size = btrfs_item_size(left, i);
3241 92490994 : if (this_item_size + sizeof(struct btrfs_item) +
3242 92490994 : push_space > free_space)
3243 : break;
3244 :
3245 90150001 : push_items++;
3246 90150001 : push_space += this_item_size + sizeof(struct btrfs_item);
3247 90150001 : if (i == 0)
3248 : break;
3249 90130400 : i--;
3250 : }
3251 :
3252 2717323 : if (push_items == 0)
3253 337926 : goto out_unlock;
3254 :
3255 2379397 : WARN_ON(!empty && push_items == left_nritems);
3256 :
3257 : /* push left to right */
3258 2379397 : right_nritems = btrfs_header_nritems(right);
3259 :
3260 2379397 : push_space = btrfs_item_data_end(left, left_nritems - push_items);
3261 2379388 : push_space -= leaf_data_end(left);
3262 :
3263 : /* make room in the right data area */
3264 2379349 : data_end = leaf_data_end(right);
3265 2379369 : memmove_leaf_data(right, data_end - push_space, data_end,
3266 2379369 : BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3267 :
3268 : /* copy from the left data area */
3269 2379401 : copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3270 2379394 : leaf_data_end(left), push_space);
3271 :
3272 2379404 : memmove_leaf_items(right, push_items, 0, right_nritems);
3273 :
3274 : /* copy the items from left to right */
3275 2379396 : copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3276 :
3277 : /* update the item pointers */
3278 2379402 : btrfs_init_map_token(&token, right);
3279 2379392 : right_nritems += push_items;
3280 2379392 : btrfs_set_header_nritems(right, right_nritems);
3281 2379392 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3282 613133821 : for (i = 0; i < right_nritems; i++) {
3283 610754417 : push_space -= btrfs_token_item_size(&token, i);
3284 610754589 : btrfs_set_token_item_offset(&token, i, push_space);
3285 : }
3286 :
3287 2379404 : left_nritems -= push_items;
3288 2379404 : btrfs_set_header_nritems(left, left_nritems);
3289 :
3290 2379404 : if (left_nritems)
3291 2359803 : btrfs_mark_buffer_dirty(left);
3292 : else
3293 19601 : btrfs_clear_buffer_dirty(trans, left);
3294 :
3295 2379395 : btrfs_mark_buffer_dirty(right);
3296 :
3297 2379406 : btrfs_item_key(right, &disk_key, 0);
3298 2379399 : btrfs_set_node_key(upper, &disk_key, slot + 1);
3299 2379358 : btrfs_mark_buffer_dirty(upper);
3300 :
3301 : /* then fixup the leaf pointer in the path */
3302 2379403 : if (path->slots[0] >= left_nritems) {
3303 215771 : path->slots[0] -= left_nritems;
3304 215771 : if (btrfs_header_nritems(path->nodes[0]) == 0)
3305 19601 : btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3306 215771 : btrfs_tree_unlock(path->nodes[0]);
3307 215771 : free_extent_buffer(path->nodes[0]);
3308 215769 : path->nodes[0] = right;
3309 215769 : path->slots[1] += 1;
3310 : } else {
3311 2163632 : btrfs_tree_unlock(right);
3312 2163620 : free_extent_buffer(right);
3313 : }
3314 : return 0;
3315 :
3316 : out_unlock:
3317 337926 : btrfs_tree_unlock(right);
3318 337925 : free_extent_buffer(right);
3319 337925 : return 1;
3320 : }
3321 :
3322 : /*
3323 : * push some data in the path leaf to the right, trying to free up at
3324 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3325 : *
3326 : * returns 1 if the push failed because the other node didn't have enough
3327 : * room, 0 if everything worked out and < 0 if there were major errors.
3328 : *
3329 : * this will push starting from min_slot to the end of the leaf. It won't
3330 : * push any slot lower than min_slot
3331 : */
3332 7034291 : static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3333 : *root, struct btrfs_path *path,
3334 : int min_data_size, int data_size,
3335 : int empty, u32 min_slot)
3336 : {
3337 7034291 : struct extent_buffer *left = path->nodes[0];
3338 7034291 : struct extent_buffer *right;
3339 7034291 : struct extent_buffer *upper;
3340 7034291 : int slot;
3341 7034291 : int free_space;
3342 7034291 : u32 left_nritems;
3343 7034291 : int ret;
3344 :
3345 7034291 : if (!path->nodes[1])
3346 : return 1;
3347 :
3348 3982074 : slot = path->slots[1];
3349 3982074 : upper = path->nodes[1];
3350 3982074 : if (slot >= btrfs_header_nritems(upper) - 1)
3351 : return 1;
3352 :
3353 3384277 : btrfs_assert_tree_write_locked(path->nodes[1]);
3354 :
3355 3384277 : right = btrfs_read_node_slot(upper, slot + 1);
3356 3384395 : if (IS_ERR(right))
3357 0 : return PTR_ERR(right);
3358 :
3359 3384395 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3360 :
3361 3384395 : free_space = btrfs_leaf_free_space(right);
3362 3384431 : if (free_space < data_size)
3363 647163 : goto out_unlock;
3364 :
3365 2737268 : ret = btrfs_cow_block(trans, root, right, upper,
3366 : slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3367 2737254 : if (ret)
3368 0 : goto out_unlock;
3369 :
3370 2737254 : left_nritems = btrfs_header_nritems(left);
3371 2737254 : if (left_nritems == 0)
3372 0 : goto out_unlock;
3373 :
3374 2737254 : if (check_sibling_keys(left, right)) {
3375 0 : ret = -EUCLEAN;
3376 0 : btrfs_abort_transaction(trans, ret);
3377 0 : btrfs_tree_unlock(right);
3378 0 : free_extent_buffer(right);
3379 0 : return ret;
3380 : }
3381 2737246 : if (path->slots[0] == left_nritems && !empty) {
3382 : /* Key greater than all keys in the leaf, right neighbor has
3383 : * enough room for it and we're not emptying our leaf to delete
3384 : * it, therefore use right neighbor to insert the new item and
3385 : * no need to touch/dirty our left leaf. */
3386 19939 : btrfs_tree_unlock(left);
3387 19939 : free_extent_buffer(left);
3388 19939 : path->nodes[0] = right;
3389 19939 : path->slots[0] = 0;
3390 19939 : path->slots[1]++;
3391 19939 : return 0;
3392 : }
3393 :
3394 2717307 : return __push_leaf_right(trans, path, min_data_size, empty, right,
3395 : free_space, left_nritems, min_slot);
3396 647163 : out_unlock:
3397 647163 : btrfs_tree_unlock(right);
3398 647162 : free_extent_buffer(right);
3399 647162 : return 1;
3400 : }
3401 :
3402 : /*
3403 : * push some data in the path leaf to the left, trying to free up at
3404 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3405 : *
3406 : * max_slot can put a limit on how far into the leaf we'll push items. The
3407 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3408 : * items
3409 : */
3410 1222987 : static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3411 : struct btrfs_path *path, int data_size,
3412 : int empty, struct extent_buffer *left,
3413 : int free_space, u32 right_nritems,
3414 : u32 max_slot)
3415 : {
3416 1222987 : struct btrfs_fs_info *fs_info = left->fs_info;
3417 1222987 : struct btrfs_disk_key disk_key;
3418 1222987 : struct extent_buffer *right = path->nodes[0];
3419 1222987 : int i;
3420 1222987 : int push_space = 0;
3421 1222987 : int push_items = 0;
3422 1222987 : u32 old_left_nritems;
3423 1222987 : u32 nr;
3424 1222987 : int ret = 0;
3425 1222987 : u32 this_item_size;
3426 1222987 : u32 old_left_item_size;
3427 1222987 : struct btrfs_map_token token;
3428 :
3429 1222987 : if (empty)
3430 163389 : nr = min(right_nritems, max_slot);
3431 : else
3432 1059598 : nr = min(right_nritems - 1, max_slot);
3433 :
3434 41294167 : for (i = 0; i < nr; i++) {
3435 41205231 : if (!empty && push_items > 0) {
3436 35578670 : if (path->slots[0] < i)
3437 : break;
3438 35482546 : if (path->slots[0] == i) {
3439 163342 : int space = btrfs_leaf_free_space(right);
3440 :
3441 163342 : if (space + push_space * 2 > free_space)
3442 : break;
3443 : }
3444 : }
3445 :
3446 41043927 : if (path->slots[0] == i)
3447 249857 : push_space += data_size;
3448 :
3449 41043927 : this_item_size = btrfs_item_size(right, i);
3450 41043946 : if (this_item_size + sizeof(struct btrfs_item) + push_space >
3451 : free_space)
3452 : break;
3453 :
3454 40071180 : push_items++;
3455 40071180 : push_space += this_item_size + sizeof(struct btrfs_item);
3456 : }
3457 :
3458 1223006 : if (push_items == 0) {
3459 182191 : ret = 1;
3460 182191 : goto out;
3461 : }
3462 1918239 : WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3463 :
3464 : /* push data from right to left */
3465 1040815 : copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3466 :
3467 1040809 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3468 1040809 : btrfs_item_offset(right, push_items - 1);
3469 :
3470 1040808 : copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3471 : btrfs_item_offset(right, push_items - 1), push_space);
3472 1040815 : old_left_nritems = btrfs_header_nritems(left);
3473 1040815 : BUG_ON(old_left_nritems <= 0);
3474 :
3475 1040815 : btrfs_init_map_token(&token, left);
3476 1040813 : old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3477 41109362 : for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3478 40068549 : u32 ioff;
3479 :
3480 40068549 : ioff = btrfs_token_item_offset(&token, i);
3481 40068742 : btrfs_set_token_item_offset(&token, i,
3482 40068742 : ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3483 : }
3484 1040813 : btrfs_set_header_nritems(left, old_left_nritems + push_items);
3485 :
3486 : /* fixup right node */
3487 1040813 : if (push_items > right_nritems)
3488 0 : WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3489 : right_nritems);
3490 :
3491 1040813 : if (push_items < right_nritems) {
3492 952184 : push_space = btrfs_item_offset(right, push_items - 1) -
3493 952183 : leaf_data_end(right);
3494 952179 : memmove_leaf_data(right,
3495 952179 : BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3496 952184 : leaf_data_end(right), push_space);
3497 :
3498 952183 : memmove_leaf_items(right, 0, push_items,
3499 952183 : btrfs_header_nritems(right) - push_items);
3500 : }
3501 :
3502 1040815 : btrfs_init_map_token(&token, right);
3503 1040814 : right_nritems -= push_items;
3504 1040814 : btrfs_set_header_nritems(right, right_nritems);
3505 1040814 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3506 165508865 : for (i = 0; i < right_nritems; i++) {
3507 164468050 : push_space = push_space - btrfs_token_item_size(&token, i);
3508 164468795 : btrfs_set_token_item_offset(&token, i, push_space);
3509 : }
3510 :
3511 1040815 : btrfs_mark_buffer_dirty(left);
3512 1040813 : if (right_nritems)
3513 952182 : btrfs_mark_buffer_dirty(right);
3514 : else
3515 88631 : btrfs_clear_buffer_dirty(trans, right);
3516 :
3517 1040816 : btrfs_item_key(right, &disk_key, 0);
3518 1040815 : fixup_low_keys(path, &disk_key, 1);
3519 :
3520 : /* then fixup the leaf pointer in the path */
3521 1040815 : if (path->slots[0] < push_items) {
3522 240494 : path->slots[0] += old_left_nritems;
3523 240494 : btrfs_tree_unlock(path->nodes[0]);
3524 240494 : free_extent_buffer(path->nodes[0]);
3525 240494 : path->nodes[0] = left;
3526 240494 : path->slots[1] -= 1;
3527 : } else {
3528 800321 : btrfs_tree_unlock(left);
3529 800321 : free_extent_buffer(left);
3530 800323 : path->slots[0] -= push_items;
3531 : }
3532 1040817 : BUG_ON(path->slots[0] < 0);
3533 : return ret;
3534 : out:
3535 182191 : btrfs_tree_unlock(left);
3536 182192 : free_extent_buffer(left);
3537 182192 : return ret;
3538 : }
3539 :
3540 : /*
3541 : * push some data in the path leaf to the left, trying to free up at
3542 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3543 : *
3544 : * max_slot can put a limit on how far into the leaf we'll push items. The
3545 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3546 : * items
3547 : */
3548 4803586 : static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3549 : *root, struct btrfs_path *path, int min_data_size,
3550 : int data_size, int empty, u32 max_slot)
3551 : {
3552 4803586 : struct extent_buffer *right = path->nodes[0];
3553 4803586 : struct extent_buffer *left;
3554 4803586 : int slot;
3555 4803586 : int free_space;
3556 4803586 : u32 right_nritems;
3557 4803586 : int ret = 0;
3558 :
3559 4803586 : slot = path->slots[1];
3560 4803586 : if (slot == 0)
3561 : return 1;
3562 1684628 : if (!path->nodes[1])
3563 : return 1;
3564 :
3565 1684628 : right_nritems = btrfs_header_nritems(right);
3566 1684628 : if (right_nritems == 0)
3567 : return 1;
3568 :
3569 1684627 : btrfs_assert_tree_write_locked(path->nodes[1]);
3570 :
3571 1684627 : left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3572 1684651 : if (IS_ERR(left))
3573 0 : return PTR_ERR(left);
3574 :
3575 1684651 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3576 :
3577 1684653 : free_space = btrfs_leaf_free_space(left);
3578 1684653 : if (free_space < data_size) {
3579 460226 : ret = 1;
3580 460226 : goto out;
3581 : }
3582 :
3583 1224427 : ret = btrfs_cow_block(trans, root, left,
3584 : path->nodes[1], slot - 1, &left,
3585 : BTRFS_NESTING_LEFT_COW);
3586 1224424 : if (ret) {
3587 : /* we hit -ENOSPC, but it isn't fatal here */
3588 1423 : if (ret == -ENOSPC)
3589 1423 : ret = 1;
3590 1423 : goto out;
3591 : }
3592 :
3593 1223001 : if (check_sibling_keys(left, right)) {
3594 0 : ret = -EUCLEAN;
3595 0 : btrfs_abort_transaction(trans, ret);
3596 0 : goto out;
3597 : }
3598 1222995 : return __push_leaf_left(trans, path, min_data_size, empty, left,
3599 : free_space, right_nritems, max_slot);
3600 461649 : out:
3601 461649 : btrfs_tree_unlock(left);
3602 461649 : free_extent_buffer(left);
3603 461649 : return ret;
3604 : }
3605 :
3606 : /*
3607 : * split the path's leaf in two, making sure there is at least data_size
3608 : * available for the resulting leaf level of the path.
3609 : */
3610 361821 : static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3611 : struct btrfs_path *path,
3612 : struct extent_buffer *l,
3613 : struct extent_buffer *right,
3614 : int slot, int mid, int nritems)
3615 : {
3616 361821 : struct btrfs_fs_info *fs_info = trans->fs_info;
3617 361821 : int data_copy_size;
3618 361821 : int rt_data_off;
3619 361821 : int i;
3620 361821 : int ret;
3621 361821 : struct btrfs_disk_key disk_key;
3622 361821 : struct btrfs_map_token token;
3623 :
3624 361821 : nritems = nritems - mid;
3625 361821 : btrfs_set_header_nritems(right, nritems);
3626 361821 : data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3627 :
3628 361821 : copy_leaf_items(right, l, 0, mid, nritems);
3629 :
3630 361820 : copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3631 361819 : leaf_data_end(l), data_copy_size);
3632 :
3633 361820 : rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3634 :
3635 361821 : btrfs_init_map_token(&token, right);
3636 30220688 : for (i = 0; i < nritems; i++) {
3637 29497046 : u32 ioff;
3638 :
3639 29497046 : ioff = btrfs_token_item_offset(&token, i);
3640 29497064 : btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3641 : }
3642 :
3643 361821 : btrfs_set_header_nritems(l, mid);
3644 361821 : btrfs_item_key(right, &disk_key, 0);
3645 361820 : ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3646 361817 : if (ret < 0)
3647 : return ret;
3648 :
3649 361817 : btrfs_mark_buffer_dirty(right);
3650 361822 : btrfs_mark_buffer_dirty(l);
3651 361822 : BUG_ON(path->slots[0] != slot);
3652 :
3653 361822 : if (mid <= slot) {
3654 256382 : btrfs_tree_unlock(path->nodes[0]);
3655 256382 : free_extent_buffer(path->nodes[0]);
3656 256382 : path->nodes[0] = right;
3657 256382 : path->slots[0] -= mid;
3658 256382 : path->slots[1] += 1;
3659 : } else {
3660 105440 : btrfs_tree_unlock(right);
3661 105440 : free_extent_buffer(right);
3662 : }
3663 :
3664 361822 : BUG_ON(path->slots[0] < 0);
3665 :
3666 : return 0;
3667 : }
3668 :
3669 : /*
3670 : * double splits happen when we need to insert a big item in the middle
3671 : * of a leaf. A double split can leave us with 3 mostly empty leaves:
3672 : * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3673 : * A B C
3674 : *
3675 : * We avoid this by trying to push the items on either side of our target
3676 : * into the adjacent leaves. If all goes well we can avoid the double split
3677 : * completely.
3678 : */
3679 426 : static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3680 : struct btrfs_root *root,
3681 : struct btrfs_path *path,
3682 : int data_size)
3683 : {
3684 426 : int ret;
3685 426 : int progress = 0;
3686 426 : int slot;
3687 426 : u32 nritems;
3688 426 : int space_needed = data_size;
3689 :
3690 426 : slot = path->slots[0];
3691 426 : if (slot < btrfs_header_nritems(path->nodes[0]))
3692 426 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3693 :
3694 : /*
3695 : * try to push all the items after our slot into the
3696 : * right leaf
3697 : */
3698 426 : ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3699 426 : if (ret < 0)
3700 : return ret;
3701 :
3702 426 : if (ret == 0)
3703 1 : progress++;
3704 :
3705 426 : nritems = btrfs_header_nritems(path->nodes[0]);
3706 : /*
3707 : * our goal is to get our slot at the start or end of a leaf. If
3708 : * we've done so we're done
3709 : */
3710 426 : if (path->slots[0] == 0 || path->slots[0] == nritems)
3711 : return 0;
3712 :
3713 425 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3714 : return 0;
3715 :
3716 : /* try to push all the items before our slot into the next leaf */
3717 425 : slot = path->slots[0];
3718 425 : space_needed = data_size;
3719 425 : if (slot > 0)
3720 425 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3721 425 : ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3722 425 : if (ret < 0)
3723 : return ret;
3724 :
3725 425 : if (ret == 0)
3726 0 : progress++;
3727 :
3728 425 : if (progress)
3729 0 : return 0;
3730 : return 1;
3731 : }
3732 :
3733 : /*
3734 : * split the path's leaf in two, making sure there is at least data_size
3735 : * available for the resulting leaf level of the path.
3736 : *
3737 : * returns 0 if all went well and < 0 on failure.
3738 : */
3739 3594961 : static noinline int split_leaf(struct btrfs_trans_handle *trans,
3740 : struct btrfs_root *root,
3741 : const struct btrfs_key *ins_key,
3742 : struct btrfs_path *path, int data_size,
3743 : int extend)
3744 : {
3745 3594961 : struct btrfs_disk_key disk_key;
3746 3594961 : struct extent_buffer *l;
3747 3594961 : u32 nritems;
3748 3594961 : int mid;
3749 3594961 : int slot;
3750 3594961 : struct extent_buffer *right;
3751 3594961 : struct btrfs_fs_info *fs_info = root->fs_info;
3752 3594961 : int ret = 0;
3753 3594961 : int wret;
3754 3594961 : int split;
3755 3594961 : int num_doubles = 0;
3756 3594961 : int tried_avoid_double = 0;
3757 :
3758 3594961 : l = path->nodes[0];
3759 3594961 : slot = path->slots[0];
3760 4276203 : if (extend && data_size + btrfs_item_size(l, slot) +
3761 681242 : sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3762 : return -EOVERFLOW;
3763 :
3764 : /* first try to make some room by pushing left and right */
3765 3586921 : if (data_size && path->nodes[1]) {
3766 3585002 : int space_needed = data_size;
3767 :
3768 3585002 : if (slot < btrfs_header_nritems(l))
3769 3288294 : space_needed -= btrfs_leaf_free_space(l);
3770 :
3771 3585109 : wret = push_leaf_right(trans, root, path, space_needed,
3772 : space_needed, 0, 0);
3773 3585100 : if (wret < 0)
3774 : return wret;
3775 3585100 : if (wret) {
3776 1205391 : space_needed = data_size;
3777 1205391 : if (slot > 0)
3778 1200668 : space_needed -= btrfs_leaf_free_space(l);
3779 1205397 : wret = push_leaf_left(trans, root, path, space_needed,
3780 : space_needed, 0, (u32)-1);
3781 1205407 : if (wret < 0)
3782 : return wret;
3783 : }
3784 3585116 : l = path->nodes[0];
3785 :
3786 : /* did the pushes work? */
3787 3585116 : if (btrfs_leaf_free_space(l) >= data_size)
3788 : return 0;
3789 : }
3790 :
3791 362114 : if (!path->nodes[1]) {
3792 1882 : ret = insert_new_root(trans, root, path, 1);
3793 1882 : if (ret)
3794 : return ret;
3795 : }
3796 362114 : again:
3797 362965 : split = 1;
3798 362965 : l = path->nodes[0];
3799 362965 : slot = path->slots[0];
3800 362965 : nritems = btrfs_header_nritems(l);
3801 362965 : mid = (nritems + 1) / 2;
3802 :
3803 362965 : if (mid <= slot) {
3804 513873 : if (nritems == 1 ||
3805 256885 : leaf_space_used(l, mid, nritems - mid) + data_size >
3806 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3807 3169 : if (slot >= nritems) {
3808 : split = 0;
3809 : } else {
3810 2776 : mid = slot;
3811 2776 : if (mid != nritems &&
3812 2776 : leaf_space_used(l, mid, nritems - mid) +
3813 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3814 723 : if (data_size && !tried_avoid_double)
3815 362 : goto push_for_double;
3816 : split = 2;
3817 : }
3818 : }
3819 : }
3820 : } else {
3821 105977 : if (leaf_space_used(l, 0, mid) + data_size >
3822 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3823 550 : if (!extend && data_size && slot == 0) {
3824 : split = 0;
3825 212 : } else if ((extend || !data_size) && slot == 0) {
3826 : mid = 1;
3827 : } else {
3828 212 : mid = slot;
3829 424 : if (mid != nritems &&
3830 212 : leaf_space_used(l, mid, nritems - mid) +
3831 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3832 128 : if (data_size && !tried_avoid_double)
3833 64 : goto push_for_double;
3834 : split = 2;
3835 : }
3836 : }
3837 : }
3838 : }
3839 :
3840 : if (split == 0)
3841 731 : btrfs_cpu_key_to_disk(&disk_key, ins_key);
3842 : else
3843 361809 : btrfs_item_key(l, &disk_key, mid);
3844 :
3845 : /*
3846 : * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3847 : * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3848 : * subclasses, which is 8 at the time of this patch, and we've maxed it
3849 : * out. In the future we could add a
3850 : * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3851 : * use BTRFS_NESTING_NEW_ROOT.
3852 : */
3853 724651 : right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3854 : &disk_key, 0, l->start, 0,
3855 : num_doubles ? BTRFS_NESTING_NEW_ROOT :
3856 : BTRFS_NESTING_SPLIT);
3857 362553 : if (IS_ERR(right))
3858 0 : return PTR_ERR(right);
3859 :
3860 362553 : root_add_used(root, fs_info->nodesize);
3861 :
3862 362553 : if (split == 0) {
3863 731 : if (mid <= slot) {
3864 393 : btrfs_set_header_nritems(right, 0);
3865 393 : ret = insert_ptr(trans, path, &disk_key,
3866 393 : right->start, path->slots[1] + 1, 1);
3867 393 : if (ret < 0) {
3868 0 : btrfs_tree_unlock(right);
3869 0 : free_extent_buffer(right);
3870 0 : return ret;
3871 : }
3872 393 : btrfs_tree_unlock(path->nodes[0]);
3873 393 : free_extent_buffer(path->nodes[0]);
3874 393 : path->nodes[0] = right;
3875 393 : path->slots[0] = 0;
3876 393 : path->slots[1] += 1;
3877 : } else {
3878 338 : btrfs_set_header_nritems(right, 0);
3879 338 : ret = insert_ptr(trans, path, &disk_key,
3880 : right->start, path->slots[1], 1);
3881 338 : if (ret < 0) {
3882 0 : btrfs_tree_unlock(right);
3883 0 : free_extent_buffer(right);
3884 0 : return ret;
3885 : }
3886 338 : btrfs_tree_unlock(path->nodes[0]);
3887 338 : free_extent_buffer(path->nodes[0]);
3888 338 : path->nodes[0] = right;
3889 338 : path->slots[0] = 0;
3890 338 : if (path->slots[1] == 0)
3891 46 : fixup_low_keys(path, &disk_key, 1);
3892 : }
3893 : /*
3894 : * We create a new leaf 'right' for the required ins_len and
3895 : * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3896 : * the content of ins_len to 'right'.
3897 : */
3898 731 : return ret;
3899 : }
3900 :
3901 361822 : ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3902 361821 : if (ret < 0) {
3903 0 : btrfs_tree_unlock(right);
3904 0 : free_extent_buffer(right);
3905 0 : return ret;
3906 : }
3907 :
3908 361821 : if (split == 2) {
3909 425 : BUG_ON(num_doubles != 0);
3910 425 : num_doubles++;
3911 425 : goto again;
3912 : }
3913 :
3914 : return 0;
3915 :
3916 426 : push_for_double:
3917 426 : push_for_double_split(trans, root, path, data_size);
3918 426 : tried_avoid_double = 1;
3919 426 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3920 : return 0;
3921 426 : goto again;
3922 : }
3923 :
3924 2806760 : static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3925 : struct btrfs_root *root,
3926 : struct btrfs_path *path, int ins_len)
3927 : {
3928 2806760 : struct btrfs_key key;
3929 2806760 : struct extent_buffer *leaf;
3930 2806760 : struct btrfs_file_extent_item *fi;
3931 2806760 : u64 extent_len = 0;
3932 2806760 : u32 item_size;
3933 2806760 : int ret;
3934 :
3935 2806760 : leaf = path->nodes[0];
3936 2806760 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3937 :
3938 2806761 : BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3939 : key.type != BTRFS_EXTENT_CSUM_KEY);
3940 :
3941 2806761 : if (btrfs_leaf_free_space(leaf) >= ins_len)
3942 : return 0;
3943 :
3944 41623 : item_size = btrfs_item_size(leaf, path->slots[0]);
3945 41623 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3946 39723 : fi = btrfs_item_ptr(leaf, path->slots[0],
3947 : struct btrfs_file_extent_item);
3948 39723 : extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3949 : }
3950 41623 : btrfs_release_path(path);
3951 :
3952 41623 : path->keep_locks = 1;
3953 41623 : path->search_for_split = 1;
3954 41623 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3955 41623 : path->search_for_split = 0;
3956 41623 : if (ret > 0)
3957 : ret = -EAGAIN;
3958 41622 : if (ret < 0)
3959 1 : goto err;
3960 :
3961 41622 : ret = -EAGAIN;
3962 41622 : leaf = path->nodes[0];
3963 : /* if our item isn't there, return now */
3964 41622 : if (item_size != btrfs_item_size(leaf, path->slots[0]))
3965 1 : goto err;
3966 :
3967 : /* the leaf has changed, it now has room. return now */
3968 41621 : if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3969 28 : goto err;
3970 :
3971 41593 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3972 39695 : fi = btrfs_item_ptr(leaf, path->slots[0],
3973 : struct btrfs_file_extent_item);
3974 39695 : if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3975 0 : goto err;
3976 : }
3977 :
3978 41593 : ret = split_leaf(trans, root, &key, path, ins_len, 1);
3979 41593 : if (ret)
3980 0 : goto err;
3981 :
3982 41593 : path->keep_locks = 0;
3983 41593 : btrfs_unlock_up_safe(path, 1);
3984 41593 : return 0;
3985 30 : err:
3986 30 : path->keep_locks = 0;
3987 30 : return ret;
3988 : }
3989 :
3990 193777 : static noinline int split_item(struct btrfs_path *path,
3991 : const struct btrfs_key *new_key,
3992 : unsigned long split_offset)
3993 : {
3994 193777 : struct extent_buffer *leaf;
3995 193777 : int orig_slot, slot;
3996 193777 : char *buf;
3997 193777 : u32 nritems;
3998 193777 : u32 item_size;
3999 193777 : u32 orig_offset;
4000 193777 : struct btrfs_disk_key disk_key;
4001 :
4002 193777 : leaf = path->nodes[0];
4003 : /*
4004 : * Shouldn't happen because the caller must have previously called
4005 : * setup_leaf_for_split() to make room for the new item in the leaf.
4006 : */
4007 193777 : if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
4008 : return -ENOSPC;
4009 :
4010 193777 : orig_slot = path->slots[0];
4011 193777 : orig_offset = btrfs_item_offset(leaf, path->slots[0]);
4012 193777 : item_size = btrfs_item_size(leaf, path->slots[0]);
4013 :
4014 193777 : buf = kmalloc(item_size, GFP_NOFS);
4015 193777 : if (!buf)
4016 : return -ENOMEM;
4017 :
4018 193777 : read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4019 : path->slots[0]), item_size);
4020 :
4021 193777 : slot = path->slots[0] + 1;
4022 193777 : nritems = btrfs_header_nritems(leaf);
4023 193777 : if (slot != nritems) {
4024 : /* shift the items */
4025 187103 : memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
4026 : }
4027 :
4028 193777 : btrfs_cpu_key_to_disk(&disk_key, new_key);
4029 193777 : btrfs_set_item_key(leaf, &disk_key, slot);
4030 :
4031 193777 : btrfs_set_item_offset(leaf, slot, orig_offset);
4032 193777 : btrfs_set_item_size(leaf, slot, item_size - split_offset);
4033 :
4034 193777 : btrfs_set_item_offset(leaf, orig_slot,
4035 193777 : orig_offset + item_size - split_offset);
4036 193777 : btrfs_set_item_size(leaf, orig_slot, split_offset);
4037 :
4038 193777 : btrfs_set_header_nritems(leaf, nritems + 1);
4039 :
4040 : /* write the data for the start of the original item */
4041 387554 : write_extent_buffer(leaf, buf,
4042 193777 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4043 : split_offset);
4044 :
4045 : /* write the data for the new item */
4046 193777 : write_extent_buffer(leaf, buf + split_offset,
4047 193777 : btrfs_item_ptr_offset(leaf, slot),
4048 : item_size - split_offset);
4049 193777 : btrfs_mark_buffer_dirty(leaf);
4050 :
4051 193777 : BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4052 193777 : kfree(buf);
4053 193777 : return 0;
4054 : }
4055 :
4056 : /*
4057 : * This function splits a single item into two items,
4058 : * giving 'new_key' to the new item and splitting the
4059 : * old one at split_offset (from the start of the item).
4060 : *
4061 : * The path may be released by this operation. After
4062 : * the split, the path is pointing to the old item. The
4063 : * new item is going to be in the same node as the old one.
4064 : *
4065 : * Note, the item being split must be smaller enough to live alone on
4066 : * a tree block with room for one extra struct btrfs_item
4067 : *
4068 : * This allows us to split the item in place, keeping a lock on the
4069 : * leaf the entire time.
4070 : */
4071 193779 : int btrfs_split_item(struct btrfs_trans_handle *trans,
4072 : struct btrfs_root *root,
4073 : struct btrfs_path *path,
4074 : const struct btrfs_key *new_key,
4075 : unsigned long split_offset)
4076 : {
4077 193779 : int ret;
4078 193779 : ret = setup_leaf_for_split(trans, root, path,
4079 : sizeof(struct btrfs_item));
4080 193779 : if (ret)
4081 : return ret;
4082 :
4083 193777 : ret = split_item(path, new_key, split_offset);
4084 193777 : return ret;
4085 : }
4086 :
4087 : /*
4088 : * make the item pointed to by the path smaller. new_size indicates
4089 : * how small to make it, and from_end tells us if we just chop bytes
4090 : * off the end of the item or if we shift the item to chop bytes off
4091 : * the front.
4092 : */
4093 17178494 : void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4094 : {
4095 17178494 : int slot;
4096 17178494 : struct extent_buffer *leaf;
4097 17178494 : u32 nritems;
4098 17178494 : unsigned int data_end;
4099 17178494 : unsigned int old_data_start;
4100 17178494 : unsigned int old_size;
4101 17178494 : unsigned int size_diff;
4102 17178494 : int i;
4103 17178494 : struct btrfs_map_token token;
4104 :
4105 17178494 : leaf = path->nodes[0];
4106 17178494 : slot = path->slots[0];
4107 :
4108 17178494 : old_size = btrfs_item_size(leaf, slot);
4109 17178490 : if (old_size == new_size)
4110 0 : return;
4111 :
4112 17178490 : nritems = btrfs_header_nritems(leaf);
4113 17178490 : data_end = leaf_data_end(leaf);
4114 :
4115 17178491 : old_data_start = btrfs_item_offset(leaf, slot);
4116 :
4117 17178490 : size_diff = old_size - new_size;
4118 :
4119 17178490 : BUG_ON(slot < 0);
4120 17178490 : BUG_ON(slot >= nritems);
4121 :
4122 : /*
4123 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4124 : */
4125 : /* first correct the data pointers */
4126 17178490 : btrfs_init_map_token(&token, leaf);
4127 1433202799 : for (i = slot; i < nritems; i++) {
4128 1398845815 : u32 ioff;
4129 :
4130 1398845815 : ioff = btrfs_token_item_offset(&token, i);
4131 1398845854 : btrfs_set_token_item_offset(&token, i, ioff + size_diff);
4132 : }
4133 :
4134 : /* shift the data */
4135 17178494 : if (from_end) {
4136 15176902 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4137 15176902 : old_data_start + new_size - data_end);
4138 : } else {
4139 2001592 : struct btrfs_disk_key disk_key;
4140 2001592 : u64 offset;
4141 :
4142 2001592 : btrfs_item_key(leaf, &disk_key, slot);
4143 :
4144 2001593 : if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4145 0 : unsigned long ptr;
4146 0 : struct btrfs_file_extent_item *fi;
4147 :
4148 0 : fi = btrfs_item_ptr(leaf, slot,
4149 : struct btrfs_file_extent_item);
4150 0 : fi = (struct btrfs_file_extent_item *)(
4151 0 : (unsigned long)fi - size_diff);
4152 :
4153 0 : if (btrfs_file_extent_type(leaf, fi) ==
4154 : BTRFS_FILE_EXTENT_INLINE) {
4155 0 : ptr = btrfs_item_ptr_offset(leaf, slot);
4156 0 : memmove_extent_buffer(leaf, ptr,
4157 : (unsigned long)fi,
4158 : BTRFS_FILE_EXTENT_INLINE_DATA_START);
4159 : }
4160 : }
4161 :
4162 2001593 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4163 2001593 : old_data_start - data_end);
4164 :
4165 2001596 : offset = btrfs_disk_key_offset(&disk_key);
4166 2001596 : btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4167 2001596 : btrfs_set_item_key(leaf, &disk_key, slot);
4168 2001596 : if (slot == 0)
4169 643596 : fixup_low_keys(path, &disk_key, 1);
4170 : }
4171 :
4172 17178495 : btrfs_set_item_size(leaf, slot, new_size);
4173 17178490 : btrfs_mark_buffer_dirty(leaf);
4174 :
4175 17178495 : if (btrfs_leaf_free_space(leaf) < 0) {
4176 0 : btrfs_print_leaf(leaf);
4177 17178486 : BUG();
4178 : }
4179 : }
4180 :
4181 : /*
4182 : * make the item pointed to by the path bigger, data_size is the added size.
4183 : */
4184 19707435 : void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4185 : {
4186 19707435 : int slot;
4187 19707435 : struct extent_buffer *leaf;
4188 19707435 : u32 nritems;
4189 19707435 : unsigned int data_end;
4190 19707435 : unsigned int old_data;
4191 19707435 : unsigned int old_size;
4192 19707435 : int i;
4193 19707435 : struct btrfs_map_token token;
4194 :
4195 19707435 : leaf = path->nodes[0];
4196 :
4197 19707435 : nritems = btrfs_header_nritems(leaf);
4198 19707435 : data_end = leaf_data_end(leaf);
4199 :
4200 19707356 : if (btrfs_leaf_free_space(leaf) < data_size) {
4201 0 : btrfs_print_leaf(leaf);
4202 0 : BUG();
4203 : }
4204 19707404 : slot = path->slots[0];
4205 19707404 : old_data = btrfs_item_data_end(leaf, slot);
4206 :
4207 19707396 : BUG_ON(slot < 0);
4208 19707396 : if (slot >= nritems) {
4209 0 : btrfs_print_leaf(leaf);
4210 0 : btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4211 : slot, nritems);
4212 0 : BUG();
4213 : }
4214 :
4215 : /*
4216 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4217 : */
4218 : /* first correct the data pointers */
4219 19707396 : btrfs_init_map_token(&token, leaf);
4220 1464722934 : for (i = slot; i < nritems; i++) {
4221 1425308064 : u32 ioff;
4222 :
4223 1425308064 : ioff = btrfs_token_item_offset(&token, i);
4224 1425310933 : btrfs_set_token_item_offset(&token, i, ioff - data_size);
4225 : }
4226 :
4227 : /* shift the data */
4228 19707474 : memmove_leaf_data(leaf, data_end - data_size, data_end,
4229 19707474 : old_data - data_end);
4230 :
4231 19707481 : data_end = old_data;
4232 19707481 : old_size = btrfs_item_size(leaf, slot);
4233 19707464 : btrfs_set_item_size(leaf, slot, old_size + data_size);
4234 19707437 : btrfs_mark_buffer_dirty(leaf);
4235 :
4236 19707471 : if (btrfs_leaf_free_space(leaf) < 0) {
4237 0 : btrfs_print_leaf(leaf);
4238 0 : BUG();
4239 : }
4240 19707457 : }
4241 :
4242 : /*
4243 : * Make space in the node before inserting one or more items.
4244 : *
4245 : * @root: root we are inserting items to
4246 : * @path: points to the leaf/slot where we are going to insert new items
4247 : * @batch: information about the batch of items to insert
4248 : *
4249 : * Main purpose is to save stack depth by doing the bulk of the work in a
4250 : * function that doesn't call btrfs_search_slot
4251 : */
4252 66841454 : static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4253 : const struct btrfs_item_batch *batch)
4254 : {
4255 66841454 : struct btrfs_fs_info *fs_info = root->fs_info;
4256 66841454 : int i;
4257 66841454 : u32 nritems;
4258 66841454 : unsigned int data_end;
4259 66841454 : struct btrfs_disk_key disk_key;
4260 66841454 : struct extent_buffer *leaf;
4261 66841454 : int slot;
4262 66841454 : struct btrfs_map_token token;
4263 66841454 : u32 total_size;
4264 :
4265 : /*
4266 : * Before anything else, update keys in the parent and other ancestors
4267 : * if needed, then release the write locks on them, so that other tasks
4268 : * can use them while we modify the leaf.
4269 : */
4270 66841454 : if (path->slots[0] == 0) {
4271 372841 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4272 372838 : fixup_low_keys(path, &disk_key, 1);
4273 : }
4274 66841453 : btrfs_unlock_up_safe(path, 1);
4275 :
4276 66847543 : leaf = path->nodes[0];
4277 66847543 : slot = path->slots[0];
4278 :
4279 66847543 : nritems = btrfs_header_nritems(leaf);
4280 66847543 : data_end = leaf_data_end(leaf);
4281 66840156 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4282 :
4283 66840156 : if (btrfs_leaf_free_space(leaf) < total_size) {
4284 0 : btrfs_print_leaf(leaf);
4285 0 : btrfs_crit(fs_info, "not enough freespace need %u have %d",
4286 : total_size, btrfs_leaf_free_space(leaf));
4287 0 : BUG();
4288 : }
4289 :
4290 66839487 : btrfs_init_map_token(&token, leaf);
4291 66831987 : if (slot != nritems) {
4292 39291159 : unsigned int old_data = btrfs_item_data_end(leaf, slot);
4293 :
4294 39290696 : if (old_data < data_end) {
4295 0 : btrfs_print_leaf(leaf);
4296 0 : btrfs_crit(fs_info,
4297 : "item at slot %d with data offset %u beyond data end of leaf %u",
4298 : slot, old_data, data_end);
4299 0 : BUG();
4300 : }
4301 : /*
4302 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4303 : */
4304 : /* first correct the data pointers */
4305 3612363073 : for (i = slot; i < nritems; i++) {
4306 3573071647 : u32 ioff;
4307 :
4308 3573071647 : ioff = btrfs_token_item_offset(&token, i);
4309 3573556551 : btrfs_set_token_item_offset(&token, i,
4310 3573556551 : ioff - batch->total_data_size);
4311 : }
4312 : /* shift the items */
4313 39291426 : memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4314 :
4315 : /* shift the data */
4316 39291285 : memmove_leaf_data(leaf, data_end - batch->total_data_size,
4317 39291285 : data_end, old_data - data_end);
4318 39291285 : data_end = old_data;
4319 : }
4320 :
4321 : /* setup the item for the new data */
4322 140685958 : for (i = 0; i < batch->nr; i++) {
4323 73833406 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4324 73821278 : btrfs_set_item_key(leaf, &disk_key, slot + i);
4325 73820585 : data_end -= batch->data_sizes[i];
4326 73820585 : btrfs_set_token_item_offset(&token, slot + i, data_end);
4327 73817599 : btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4328 : }
4329 :
4330 66852552 : btrfs_set_header_nritems(leaf, nritems + batch->nr);
4331 66852552 : btrfs_mark_buffer_dirty(leaf);
4332 :
4333 66854176 : if (btrfs_leaf_free_space(leaf) < 0) {
4334 0 : btrfs_print_leaf(leaf);
4335 0 : BUG();
4336 : }
4337 66847588 : }
4338 :
4339 : /*
4340 : * Insert a new item into a leaf.
4341 : *
4342 : * @root: The root of the btree.
4343 : * @path: A path pointing to the target leaf and slot.
4344 : * @key: The key of the new item.
4345 : * @data_size: The size of the data associated with the new key.
4346 : */
4347 3916194 : void btrfs_setup_item_for_insert(struct btrfs_root *root,
4348 : struct btrfs_path *path,
4349 : const struct btrfs_key *key,
4350 : u32 data_size)
4351 : {
4352 6529150 : struct btrfs_item_batch batch;
4353 :
4354 6529150 : batch.keys = key;
4355 6529150 : batch.data_sizes = &data_size;
4356 6529150 : batch.total_data_size = data_size;
4357 6529150 : batch.nr = 1;
4358 :
4359 3916194 : setup_items_for_insert(root, path, &batch);
4360 3916261 : }
4361 :
4362 : /*
4363 : * Given a key and some data, insert items into the tree.
4364 : * This does all the path init required, making room in the tree if needed.
4365 : */
4366 60748088 : int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4367 : struct btrfs_root *root,
4368 : struct btrfs_path *path,
4369 : const struct btrfs_item_batch *batch)
4370 : {
4371 60748088 : int ret = 0;
4372 60748088 : int slot;
4373 60748088 : u32 total_size;
4374 :
4375 60748088 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4376 60748088 : ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4377 60736430 : if (ret == 0)
4378 : return -EEXIST;
4379 60320865 : if (ret < 0)
4380 : return ret;
4381 :
4382 60312819 : slot = path->slots[0];
4383 60312819 : BUG_ON(slot < 0);
4384 :
4385 60312819 : setup_items_for_insert(root, path, batch);
4386 60312819 : return 0;
4387 : }
4388 :
4389 : /*
4390 : * Given a key and some data, insert an item into the tree.
4391 : * This does all the path init required, making room in the tree if needed.
4392 : */
4393 10328 : int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4394 : const struct btrfs_key *cpu_key, void *data,
4395 : u32 data_size)
4396 : {
4397 10328 : int ret = 0;
4398 10328 : struct btrfs_path *path;
4399 10328 : struct extent_buffer *leaf;
4400 10328 : unsigned long ptr;
4401 :
4402 10328 : path = btrfs_alloc_path();
4403 10328 : if (!path)
4404 : return -ENOMEM;
4405 10328 : ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4406 10328 : if (!ret) {
4407 10328 : leaf = path->nodes[0];
4408 10328 : ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4409 10328 : write_extent_buffer(leaf, data, ptr, data_size);
4410 10328 : btrfs_mark_buffer_dirty(leaf);
4411 : }
4412 10328 : btrfs_free_path(path);
4413 10328 : return ret;
4414 : }
4415 :
4416 : /*
4417 : * This function duplicates an item, giving 'new_key' to the new item.
4418 : * It guarantees both items live in the same tree leaf and the new item is
4419 : * contiguous with the original item.
4420 : *
4421 : * This allows us to split a file extent in place, keeping a lock on the leaf
4422 : * the entire time.
4423 : */
4424 2612982 : int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4425 : struct btrfs_root *root,
4426 : struct btrfs_path *path,
4427 : const struct btrfs_key *new_key)
4428 : {
4429 2612982 : struct extent_buffer *leaf;
4430 2612982 : int ret;
4431 2612982 : u32 item_size;
4432 :
4433 2612982 : leaf = path->nodes[0];
4434 2612982 : item_size = btrfs_item_size(leaf, path->slots[0]);
4435 2612981 : ret = setup_leaf_for_split(trans, root, path,
4436 2612981 : item_size + sizeof(struct btrfs_item));
4437 2612984 : if (ret)
4438 : return ret;
4439 :
4440 2612956 : path->slots[0]++;
4441 2612956 : btrfs_setup_item_for_insert(root, path, new_key, item_size);
4442 2612954 : leaf = path->nodes[0];
4443 7838850 : memcpy_extent_buffer(leaf,
4444 5225896 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4445 2612954 : btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4446 : item_size);
4447 2612949 : return 0;
4448 : }
4449 :
4450 : /*
4451 : * delete the pointer from a given node.
4452 : *
4453 : * the tree should have been previously balanced so the deletion does not
4454 : * empty a node.
4455 : *
4456 : * This is exported for use inside btrfs-progs, don't un-export it.
4457 : */
4458 119013 : int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4459 : struct btrfs_path *path, int level, int slot)
4460 : {
4461 119013 : struct extent_buffer *parent = path->nodes[level];
4462 119013 : u32 nritems;
4463 119013 : int ret;
4464 :
4465 119013 : nritems = btrfs_header_nritems(parent);
4466 119013 : if (slot != nritems - 1) {
4467 112642 : if (level) {
4468 112642 : ret = btrfs_tree_mod_log_insert_move(parent, slot,
4469 112642 : slot + 1, nritems - slot - 1);
4470 112641 : if (ret < 0) {
4471 0 : btrfs_abort_transaction(trans, ret);
4472 0 : return ret;
4473 : }
4474 : }
4475 112641 : memmove_extent_buffer(parent,
4476 : btrfs_node_key_ptr_offset(parent, slot),
4477 : btrfs_node_key_ptr_offset(parent, slot + 1),
4478 : sizeof(struct btrfs_key_ptr) *
4479 112641 : (nritems - slot - 1));
4480 6371 : } else if (level) {
4481 6371 : ret = btrfs_tree_mod_log_insert_key(parent, slot,
4482 : BTRFS_MOD_LOG_KEY_REMOVE);
4483 6371 : if (ret < 0) {
4484 0 : btrfs_abort_transaction(trans, ret);
4485 0 : return ret;
4486 : }
4487 : }
4488 :
4489 119014 : nritems--;
4490 119014 : btrfs_set_header_nritems(parent, nritems);
4491 119014 : if (nritems == 0 && parent == root->node) {
4492 0 : BUG_ON(btrfs_header_level(root->node) != 1);
4493 : /* just turn the root into a leaf and break */
4494 0 : btrfs_set_header_level(root->node, 0);
4495 119014 : } else if (slot == 0) {
4496 5001 : struct btrfs_disk_key disk_key;
4497 :
4498 5001 : btrfs_node_key(parent, &disk_key, 0);
4499 5001 : fixup_low_keys(path, &disk_key, level + 1);
4500 : }
4501 119014 : btrfs_mark_buffer_dirty(parent);
4502 119014 : return 0;
4503 : }
4504 :
4505 : /*
4506 : * a helper function to delete the leaf pointed to by path->slots[1] and
4507 : * path->nodes[1].
4508 : *
4509 : * This deletes the pointer in path->nodes[1] and frees the leaf
4510 : * block extent. zero is returned if it all worked out, < 0 otherwise.
4511 : *
4512 : * The path must have already been setup for deleting the leaf, including
4513 : * all the proper balancing. path->nodes[1] must be locked.
4514 : */
4515 118899 : static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4516 : struct btrfs_root *root,
4517 : struct btrfs_path *path,
4518 : struct extent_buffer *leaf)
4519 : {
4520 118899 : int ret;
4521 :
4522 118899 : WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4523 118899 : ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4524 118900 : if (ret < 0)
4525 : return ret;
4526 :
4527 : /*
4528 : * btrfs_free_extent is expensive, we want to make sure we
4529 : * aren't holding any locks when we call it
4530 : */
4531 118900 : btrfs_unlock_up_safe(path, 0);
4532 :
4533 118900 : root_sub_used(root, leaf->len);
4534 :
4535 118900 : atomic_inc(&leaf->refs);
4536 118900 : btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4537 118900 : free_extent_buffer_stale(leaf);
4538 118900 : return 0;
4539 : }
4540 : /*
4541 : * delete the item at the leaf level in path. If that empties
4542 : * the leaf, remove it from the tree
4543 : */
4544 26979226 : int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4545 : struct btrfs_path *path, int slot, int nr)
4546 : {
4547 26979226 : struct btrfs_fs_info *fs_info = root->fs_info;
4548 26979226 : struct extent_buffer *leaf;
4549 26979226 : int ret = 0;
4550 26979226 : int wret;
4551 26979226 : u32 nritems;
4552 :
4553 26979226 : leaf = path->nodes[0];
4554 26979226 : nritems = btrfs_header_nritems(leaf);
4555 :
4556 26979226 : if (slot + nr != nritems) {
4557 21783133 : const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4558 21782582 : const int data_end = leaf_data_end(leaf);
4559 21781967 : struct btrfs_map_token token;
4560 21781967 : u32 dsize = 0;
4561 21781967 : int i;
4562 :
4563 48849416 : for (i = 0; i < nr; i++)
4564 27067251 : dsize += btrfs_item_size(leaf, slot + i);
4565 :
4566 21782165 : memmove_leaf_data(leaf, data_end + dsize, data_end,
4567 21782165 : last_off - data_end);
4568 :
4569 21782599 : btrfs_init_map_token(&token, leaf);
4570 2100165874 : for (i = slot + nr; i < nritems; i++) {
4571 2056600408 : u32 ioff;
4572 :
4573 2056600408 : ioff = btrfs_token_item_offset(&token, i);
4574 2056709104 : btrfs_set_token_item_offset(&token, i, ioff + dsize);
4575 : }
4576 :
4577 21782867 : memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4578 : }
4579 26979338 : btrfs_set_header_nritems(leaf, nritems - nr);
4580 26979338 : nritems -= nr;
4581 :
4582 : /* delete the leaf if we've emptied it */
4583 26979338 : if (nritems == 0) {
4584 11134 : if (leaf == root->node) {
4585 466 : btrfs_set_header_level(leaf, 0);
4586 : } else {
4587 10668 : btrfs_clear_buffer_dirty(trans, leaf);
4588 10668 : ret = btrfs_del_leaf(trans, root, path, leaf);
4589 10668 : if (ret < 0)
4590 : return ret;
4591 : }
4592 : } else {
4593 26968204 : int used = leaf_space_used(leaf, 0, nritems);
4594 26967921 : if (slot == 0) {
4595 2408779 : struct btrfs_disk_key disk_key;
4596 :
4597 2408779 : btrfs_item_key(leaf, &disk_key, 0);
4598 2408766 : fixup_low_keys(path, &disk_key, 1);
4599 : }
4600 :
4601 : /*
4602 : * Try to delete the leaf if it is mostly empty. We do this by
4603 : * trying to move all its items into its left and right neighbours.
4604 : * If we can't move all the items, then we don't delete it - it's
4605 : * not ideal, but future insertions might fill the leaf with more
4606 : * items, or items from other leaves might be moved later into our
4607 : * leaf due to deletions on those leaves.
4608 : */
4609 26967938 : if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4610 3597768 : u32 min_push_space;
4611 :
4612 : /* push_leaf_left fixes the path.
4613 : * make sure the path still points to our leaf
4614 : * for possible call to btrfs_del_ptr below
4615 : */
4616 3597768 : slot = path->slots[1];
4617 3597768 : atomic_inc(&leaf->refs);
4618 : /*
4619 : * We want to be able to at least push one item to the
4620 : * left neighbour leaf, and that's the first item.
4621 : */
4622 3597773 : min_push_space = sizeof(struct btrfs_item) +
4623 : btrfs_item_size(leaf, 0);
4624 3597762 : wret = push_leaf_left(trans, root, path, 0,
4625 : min_push_space, 1, (u32)-1);
4626 3597725 : if (wret < 0 && wret != -ENOSPC)
4627 0 : ret = wret;
4628 :
4629 3597725 : if (path->nodes[0] == leaf &&
4630 : btrfs_header_nritems(leaf)) {
4631 : /*
4632 : * If we were not able to push all items from our
4633 : * leaf to its left neighbour, then attempt to
4634 : * either push all the remaining items to the
4635 : * right neighbour or none. There's no advantage
4636 : * in pushing only some items, instead of all, as
4637 : * it's pointless to end up with a leaf having
4638 : * too few items while the neighbours can be full
4639 : * or nearly full.
4640 : */
4641 3448762 : nritems = btrfs_header_nritems(leaf);
4642 3448762 : min_push_space = leaf_space_used(leaf, 0, nritems);
4643 3448774 : wret = push_leaf_right(trans, root, path, 0,
4644 : min_push_space, 1, 0);
4645 3448778 : if (wret < 0 && wret != -ENOSPC)
4646 0 : ret = wret;
4647 : }
4648 :
4649 3597741 : if (btrfs_header_nritems(leaf) == 0) {
4650 108231 : path->slots[1] = slot;
4651 108231 : ret = btrfs_del_leaf(trans, root, path, leaf);
4652 108232 : if (ret < 0)
4653 : return ret;
4654 108232 : free_extent_buffer(leaf);
4655 108232 : ret = 0;
4656 : } else {
4657 : /* if we're still in the path, make sure
4658 : * we're dirty. Otherwise, one of the
4659 : * push_leaf functions must have already
4660 : * dirtied this buffer
4661 : */
4662 3489510 : if (path->nodes[0] == leaf)
4663 3429173 : btrfs_mark_buffer_dirty(leaf);
4664 3489511 : free_extent_buffer(leaf);
4665 : }
4666 : } else {
4667 23370170 : btrfs_mark_buffer_dirty(leaf);
4668 : }
4669 : }
4670 : return ret;
4671 : }
4672 :
4673 : /*
4674 : * A helper function to walk down the tree starting at min_key, and looking
4675 : * for nodes or leaves that are have a minimum transaction id.
4676 : * This is used by the btree defrag code, and tree logging
4677 : *
4678 : * This does not cow, but it does stuff the starting key it finds back
4679 : * into min_key, so you can call btrfs_search_slot with cow=1 on the
4680 : * key and get a writable path.
4681 : *
4682 : * This honors path->lowest_level to prevent descent past a given level
4683 : * of the tree.
4684 : *
4685 : * min_trans indicates the oldest transaction that you are interested
4686 : * in walking through. Any nodes or leaves older than min_trans are
4687 : * skipped over (without reading them).
4688 : *
4689 : * returns zero if something useful was found, < 0 on error and 1 if there
4690 : * was nothing in the tree that matched the search criteria.
4691 : */
4692 202394 : int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4693 : struct btrfs_path *path,
4694 : u64 min_trans)
4695 : {
4696 202394 : struct extent_buffer *cur;
4697 202394 : struct btrfs_key found_key;
4698 202394 : int slot;
4699 202394 : int sret;
4700 202394 : u32 nritems;
4701 202394 : int level;
4702 202394 : int ret = 1;
4703 202394 : int keep_locks = path->keep_locks;
4704 :
4705 202394 : ASSERT(!path->nowait);
4706 202394 : path->keep_locks = 1;
4707 206841 : again:
4708 206841 : cur = btrfs_read_lock_root_node(root);
4709 206847 : level = btrfs_header_level(cur);
4710 206847 : WARN_ON(path->nodes[level]);
4711 206847 : path->nodes[level] = cur;
4712 206847 : path->locks[level] = BTRFS_READ_LOCK;
4713 :
4714 206847 : if (btrfs_header_generation(cur) < min_trans) {
4715 118 : ret = 1;
4716 118 : goto out;
4717 : }
4718 590865 : while (1) {
4719 398796 : nritems = btrfs_header_nritems(cur);
4720 398796 : level = btrfs_header_level(cur);
4721 398796 : sret = btrfs_bin_search(cur, 0, min_key, &slot);
4722 398794 : if (sret < 0) {
4723 0 : ret = sret;
4724 0 : goto out;
4725 : }
4726 :
4727 : /* at the lowest level, we're done, setup the path and exit */
4728 398794 : if (level == path->lowest_level) {
4729 206675 : if (slot >= nritems)
4730 55912 : goto find_next_key;
4731 150763 : ret = 0;
4732 150763 : path->slots[level] = slot;
4733 150763 : btrfs_item_key_to_cpu(cur, &found_key, slot);
4734 150762 : goto out;
4735 : }
4736 192119 : if (sret && slot > 0)
4737 187512 : slot--;
4738 : /*
4739 : * check this node pointer against the min_trans parameters.
4740 : * If it is too old, skip to the next one.
4741 : */
4742 200390 : while (slot < nritems) {
4743 200340 : u64 gen;
4744 :
4745 200340 : gen = btrfs_node_ptr_generation(cur, slot);
4746 200339 : if (gen < min_trans) {
4747 8271 : slot++;
4748 8271 : continue;
4749 : }
4750 : break;
4751 : }
4752 50 : find_next_key:
4753 : /*
4754 : * we didn't find a candidate key in this node, walk forward
4755 : * and find another one
4756 : */
4757 248030 : if (slot >= nritems) {
4758 55963 : path->slots[level] = slot;
4759 55963 : sret = btrfs_find_next_key(root, path, min_key, level,
4760 : min_trans);
4761 55964 : if (sret == 0) {
4762 4447 : btrfs_release_path(path);
4763 4447 : goto again;
4764 : } else {
4765 51517 : goto out;
4766 : }
4767 : }
4768 : /* save our key for returning back */
4769 192067 : btrfs_node_key_to_cpu(cur, &found_key, slot);
4770 192067 : path->slots[level] = slot;
4771 192067 : if (level == path->lowest_level) {
4772 0 : ret = 0;
4773 0 : goto out;
4774 : }
4775 192067 : cur = btrfs_read_node_slot(cur, slot);
4776 192068 : if (IS_ERR(cur)) {
4777 0 : ret = PTR_ERR(cur);
4778 0 : goto out;
4779 : }
4780 :
4781 192068 : btrfs_tree_read_lock(cur);
4782 :
4783 192069 : path->locks[level - 1] = BTRFS_READ_LOCK;
4784 192069 : path->nodes[level - 1] = cur;
4785 192069 : unlock_up(path, level, 1, 0, NULL);
4786 : }
4787 0 : out:
4788 202397 : path->keep_locks = keep_locks;
4789 202397 : if (ret == 0) {
4790 150762 : btrfs_unlock_up_safe(path, path->lowest_level + 1);
4791 301526 : memcpy(min_key, &found_key, sizeof(found_key));
4792 : }
4793 202398 : return ret;
4794 : }
4795 :
4796 : /*
4797 : * this is similar to btrfs_next_leaf, but does not try to preserve
4798 : * and fixup the path. It looks for and returns the next key in the
4799 : * tree based on the current path and the min_trans parameters.
4800 : *
4801 : * 0 is returned if another key is found, < 0 if there are any errors
4802 : * and 1 is returned if there are no higher keys in the tree
4803 : *
4804 : * path->keep_locks should be set to 1 on the search made before
4805 : * calling this function.
4806 : */
4807 55974 : int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4808 : struct btrfs_key *key, int level, u64 min_trans)
4809 : {
4810 55974 : int slot;
4811 55974 : struct extent_buffer *c;
4812 :
4813 55974 : WARN_ON(!path->keep_locks && !path->skip_locking);
4814 109483 : while (level < BTRFS_MAX_LEVEL) {
4815 109483 : if (!path->nodes[level])
4816 : return 1;
4817 :
4818 109483 : slot = path->slots[level] + 1;
4819 109483 : c = path->nodes[level];
4820 : next:
4821 109743 : if (slot >= btrfs_header_nritems(c)) {
4822 105038 : int ret;
4823 105038 : int orig_lowest;
4824 105038 : struct btrfs_key cur_key;
4825 105038 : if (level + 1 >= BTRFS_MAX_LEVEL ||
4826 105038 : !path->nodes[level + 1])
4827 51527 : return 1;
4828 :
4829 53511 : if (path->locks[level + 1] || path->skip_locking) {
4830 53509 : level++;
4831 53509 : continue;
4832 : }
4833 :
4834 2 : slot = btrfs_header_nritems(c) - 1;
4835 2 : if (level == 0)
4836 0 : btrfs_item_key_to_cpu(c, &cur_key, slot);
4837 : else
4838 2 : btrfs_node_key_to_cpu(c, &cur_key, slot);
4839 :
4840 2 : orig_lowest = path->lowest_level;
4841 2 : btrfs_release_path(path);
4842 2 : path->lowest_level = level;
4843 2 : ret = btrfs_search_slot(NULL, root, &cur_key, path,
4844 : 0, 0);
4845 2 : path->lowest_level = orig_lowest;
4846 2 : if (ret < 0)
4847 0 : return ret;
4848 :
4849 2 : c = path->nodes[level];
4850 2 : slot = path->slots[level];
4851 2 : if (ret == 0)
4852 2 : slot++;
4853 2 : goto next;
4854 : }
4855 :
4856 4705 : if (level == 0)
4857 0 : btrfs_item_key_to_cpu(c, key, slot);
4858 : else {
4859 4705 : u64 gen = btrfs_node_ptr_generation(c, slot);
4860 :
4861 4705 : if (gen < min_trans) {
4862 258 : slot++;
4863 258 : goto next;
4864 : }
4865 4447 : btrfs_node_key_to_cpu(c, key, slot);
4866 : }
4867 : return 0;
4868 : }
4869 : return 1;
4870 : }
4871 :
4872 27379818 : int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4873 : u64 time_seq)
4874 : {
4875 27379818 : int slot;
4876 27379818 : int level;
4877 27379818 : struct extent_buffer *c;
4878 27379818 : struct extent_buffer *next;
4879 27379818 : struct btrfs_fs_info *fs_info = root->fs_info;
4880 27379818 : struct btrfs_key key;
4881 27379818 : bool need_commit_sem = false;
4882 27379818 : u32 nritems;
4883 27379818 : int ret;
4884 27379818 : int i;
4885 :
4886 : /*
4887 : * The nowait semantics are used only for write paths, where we don't
4888 : * use the tree mod log and sequence numbers.
4889 : */
4890 27379818 : if (time_seq)
4891 : ASSERT(!path->nowait);
4892 :
4893 27379818 : nritems = btrfs_header_nritems(path->nodes[0]);
4894 27379818 : if (nritems == 0)
4895 : return 1;
4896 :
4897 27375859 : btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4898 : again:
4899 27381525 : level = 1;
4900 27381525 : next = NULL;
4901 27381525 : btrfs_release_path(path);
4902 :
4903 27383172 : path->keep_locks = 1;
4904 :
4905 27383172 : if (time_seq) {
4906 10675 : ret = btrfs_search_old_slot(root, &key, path, time_seq);
4907 : } else {
4908 27372497 : if (path->need_commit_sem) {
4909 68897 : path->need_commit_sem = 0;
4910 68897 : need_commit_sem = true;
4911 68897 : if (path->nowait) {
4912 0 : if (!down_read_trylock(&fs_info->commit_root_sem)) {
4913 0 : ret = -EAGAIN;
4914 0 : goto done;
4915 : }
4916 : } else {
4917 68897 : down_read(&fs_info->commit_root_sem);
4918 : }
4919 : }
4920 27372497 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4921 : }
4922 27382463 : path->keep_locks = 0;
4923 :
4924 27382463 : if (ret < 0)
4925 0 : goto done;
4926 :
4927 27382463 : nritems = btrfs_header_nritems(path->nodes[0]);
4928 : /*
4929 : * by releasing the path above we dropped all our locks. A balance
4930 : * could have added more items next to the key that used to be
4931 : * at the very end of the block. So, check again here and
4932 : * advance the path if there are now more items available.
4933 : */
4934 27382463 : if (nritems > 0 && path->slots[0] < nritems - 1) {
4935 14203 : if (ret == 0)
4936 14202 : path->slots[0]++;
4937 14203 : ret = 0;
4938 14203 : goto done;
4939 : }
4940 : /*
4941 : * So the above check misses one case:
4942 : * - after releasing the path above, someone has removed the item that
4943 : * used to be at the very end of the block, and balance between leafs
4944 : * gets another one with bigger key.offset to replace it.
4945 : *
4946 : * This one should be returned as well, or we can get leaf corruption
4947 : * later(esp. in __btrfs_drop_extents()).
4948 : *
4949 : * And a bit more explanation about this check,
4950 : * with ret > 0, the key isn't found, the path points to the slot
4951 : * where it should be inserted, so the path->slots[0] item must be the
4952 : * bigger one.
4953 : */
4954 27368260 : if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4955 0 : ret = 0;
4956 0 : goto done;
4957 : }
4958 :
4959 55651190 : while (level < BTRFS_MAX_LEVEL) {
4960 55651190 : if (!path->nodes[level]) {
4961 15688969 : ret = 1;
4962 15688969 : goto done;
4963 : }
4964 :
4965 39962221 : slot = path->slots[level] + 1;
4966 39962221 : c = path->nodes[level];
4967 39962221 : if (slot >= btrfs_header_nritems(c)) {
4968 28282930 : level++;
4969 28282930 : if (level == BTRFS_MAX_LEVEL) {
4970 0 : ret = 1;
4971 0 : goto done;
4972 : }
4973 28282930 : continue;
4974 : }
4975 :
4976 :
4977 : /*
4978 : * Our current level is where we're going to start from, and to
4979 : * make sure lockdep doesn't complain we need to drop our locks
4980 : * and nodes from 0 to our current level.
4981 : */
4982 23397240 : for (i = 0; i < level; i++) {
4983 11717469 : if (path->locks[level]) {
4984 11582953 : btrfs_tree_read_unlock(path->nodes[i]);
4985 11583114 : path->locks[i] = 0;
4986 : }
4987 11717630 : free_extent_buffer(path->nodes[i]);
4988 11717949 : path->nodes[i] = NULL;
4989 : }
4990 :
4991 11679771 : next = c;
4992 11679771 : ret = read_block_for_search(root, path, &next, level,
4993 : slot, &key);
4994 11678670 : if (ret == -EAGAIN && !path->nowait)
4995 5927 : goto again;
4996 :
4997 11672743 : if (ret < 0) {
4998 0 : btrfs_release_path(path);
4999 0 : goto done;
5000 : }
5001 :
5002 11672743 : if (!path->skip_locking) {
5003 11538594 : ret = btrfs_try_tree_read_lock(next);
5004 11538610 : if (!ret && path->nowait) {
5005 0 : ret = -EAGAIN;
5006 0 : goto done;
5007 : }
5008 11538610 : if (!ret && time_seq) {
5009 : /*
5010 : * If we don't get the lock, we may be racing
5011 : * with push_leaf_left, holding that lock while
5012 : * itself waiting for the leaf we've currently
5013 : * locked. To solve this situation, we give up
5014 : * on our lock and cycle.
5015 : */
5016 0 : free_extent_buffer(next);
5017 0 : btrfs_release_path(path);
5018 0 : cond_resched();
5019 0 : goto again;
5020 : }
5021 11538610 : if (!ret)
5022 151 : btrfs_tree_read_lock(next);
5023 : }
5024 : break;
5025 : }
5026 11672759 : path->slots[level] = slot;
5027 11710838 : while (1) {
5028 11710838 : level--;
5029 11710838 : path->nodes[level] = next;
5030 11710838 : path->slots[level] = 0;
5031 11710838 : if (!path->skip_locking)
5032 11576442 : path->locks[level] = BTRFS_READ_LOCK;
5033 11710838 : if (!level)
5034 : break;
5035 :
5036 38256 : ret = read_block_for_search(root, path, &next, level,
5037 : 0, &key);
5038 38213 : if (ret == -EAGAIN && !path->nowait)
5039 134 : goto again;
5040 :
5041 38079 : if (ret < 0) {
5042 0 : btrfs_release_path(path);
5043 0 : goto done;
5044 : }
5045 :
5046 38079 : if (!path->skip_locking) {
5047 37881 : if (path->nowait) {
5048 0 : if (!btrfs_try_tree_read_lock(next)) {
5049 0 : ret = -EAGAIN;
5050 0 : goto done;
5051 : }
5052 : } else {
5053 37881 : btrfs_tree_read_lock(next);
5054 : }
5055 : }
5056 : }
5057 : ret = 0;
5058 27375754 : done:
5059 27375754 : unlock_up(path, 0, 1, 0, NULL);
5060 27375712 : if (need_commit_sem) {
5061 68897 : int ret2;
5062 :
5063 68897 : path->need_commit_sem = 1;
5064 68897 : ret2 = finish_need_commit_sem_search(path);
5065 68897 : up_read(&fs_info->commit_root_sem);
5066 68897 : if (ret2)
5067 0 : ret = ret2;
5068 : }
5069 :
5070 : return ret;
5071 : }
5072 :
5073 17875463 : int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
5074 : {
5075 17875463 : path->slots[0]++;
5076 17875463 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
5077 81410 : return btrfs_next_old_leaf(root, path, time_seq);
5078 : return 0;
5079 : }
5080 :
5081 : /*
5082 : * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5083 : * searching until it gets past min_objectid or finds an item of 'type'
5084 : *
5085 : * returns 0 if something is found, 1 if nothing was found and < 0 on error
5086 : */
5087 208367 : int btrfs_previous_item(struct btrfs_root *root,
5088 : struct btrfs_path *path, u64 min_objectid,
5089 : int type)
5090 : {
5091 208755 : struct btrfs_key found_key;
5092 208755 : struct extent_buffer *leaf;
5093 208755 : u32 nritems;
5094 208755 : int ret;
5095 :
5096 208755 : while (1) {
5097 208755 : if (path->slots[0] == 0) {
5098 939 : ret = btrfs_prev_leaf(root, path);
5099 939 : if (ret != 0)
5100 937 : return ret;
5101 : } else {
5102 207816 : path->slots[0]--;
5103 : }
5104 207818 : leaf = path->nodes[0];
5105 207818 : nritems = btrfs_header_nritems(leaf);
5106 207818 : if (nritems == 0)
5107 : return 1;
5108 207818 : if (path->slots[0] == nritems)
5109 2 : path->slots[0]--;
5110 :
5111 207818 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5112 207818 : if (found_key.objectid < min_objectid)
5113 : break;
5114 207102 : if (found_key.type == type)
5115 : return 0;
5116 1635 : if (found_key.objectid == min_objectid &&
5117 : found_key.type < type)
5118 : break;
5119 : }
5120 : return 1;
5121 : }
5122 :
5123 : /*
5124 : * search in extent tree to find a previous Metadata/Data extent item with
5125 : * min objecitd.
5126 : *
5127 : * returns 0 if something is found, 1 if nothing was found and < 0 on error
5128 : */
5129 67820 : int btrfs_previous_extent_item(struct btrfs_root *root,
5130 : struct btrfs_path *path, u64 min_objectid)
5131 : {
5132 302872 : struct btrfs_key found_key;
5133 302872 : struct extent_buffer *leaf;
5134 302872 : u32 nritems;
5135 302872 : int ret;
5136 :
5137 302872 : while (1) {
5138 302872 : if (path->slots[0] == 0) {
5139 546 : ret = btrfs_prev_leaf(root, path);
5140 546 : if (ret != 0)
5141 9 : return ret;
5142 : } else {
5143 302326 : path->slots[0]--;
5144 : }
5145 302863 : leaf = path->nodes[0];
5146 302863 : nritems = btrfs_header_nritems(leaf);
5147 302863 : if (nritems == 0)
5148 : return 1;
5149 302863 : if (path->slots[0] == nritems)
5150 537 : path->slots[0]--;
5151 :
5152 302863 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5153 302863 : if (found_key.objectid < min_objectid)
5154 : break;
5155 302863 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5156 : found_key.type == BTRFS_METADATA_ITEM_KEY)
5157 : return 0;
5158 235052 : if (found_key.objectid == min_objectid &&
5159 : found_key.type < BTRFS_EXTENT_ITEM_KEY)
5160 : break;
5161 : }
5162 : return 1;
5163 : }
5164 :
5165 11 : int __init btrfs_ctree_init(void)
5166 : {
5167 11 : btrfs_path_cachep = kmem_cache_create("btrfs_path",
5168 : sizeof(struct btrfs_path), 0,
5169 : SLAB_MEM_SPREAD, NULL);
5170 11 : if (!btrfs_path_cachep)
5171 0 : return -ENOMEM;
5172 : return 0;
5173 : }
5174 :
5175 0 : void __cold btrfs_ctree_exit(void)
5176 : {
5177 0 : kmem_cache_destroy(btrfs_path_cachep);
5178 0 : }
|