Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2007,2008 Oracle. All rights reserved.
4 : */
5 :
6 : #include <linux/sched.h>
7 : #include <linux/slab.h>
8 : #include <linux/rbtree.h>
9 : #include <linux/mm.h>
10 : #include <linux/error-injection.h>
11 : #include "messages.h"
12 : #include "ctree.h"
13 : #include "disk-io.h"
14 : #include "transaction.h"
15 : #include "print-tree.h"
16 : #include "locking.h"
17 : #include "volumes.h"
18 : #include "qgroup.h"
19 : #include "tree-mod-log.h"
20 : #include "tree-checker.h"
21 : #include "fs.h"
22 : #include "accessors.h"
23 : #include "extent-tree.h"
24 : #include "relocation.h"
25 : #include "file-item.h"
26 :
27 : static struct kmem_cache *btrfs_path_cachep;
28 :
29 : static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30 : *root, struct btrfs_path *path, int level);
31 : static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32 : const struct btrfs_key *ins_key, struct btrfs_path *path,
33 : int data_size, int extend);
34 : static int push_node_left(struct btrfs_trans_handle *trans,
35 : struct extent_buffer *dst,
36 : struct extent_buffer *src, int empty);
37 : static int balance_node_right(struct btrfs_trans_handle *trans,
38 : struct extent_buffer *dst_buf,
39 : struct extent_buffer *src_buf);
40 :
41 : static const struct btrfs_csums {
42 : u16 size;
43 : const char name[10];
44 : const char driver[12];
45 : } btrfs_csums[] = {
46 : [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
47 : [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
48 : [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
49 : [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
50 : .driver = "blake2b-256" },
51 : };
52 :
53 : /*
54 : * The leaf data grows from end-to-front in the node. This returns the address
55 : * of the start of the last item, which is the stop of the leaf data stack.
56 : */
57 135793593 : static unsigned int leaf_data_end(const struct extent_buffer *leaf)
58 : {
59 135793593 : u32 nr = btrfs_header_nritems(leaf);
60 :
61 135793593 : if (nr == 0)
62 10063 : return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
63 135783530 : return btrfs_item_offset(leaf, nr - 1);
64 : }
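
A btrfs leaf keeps its item headers growing forward from the leaf header while the item data grows backward from the end of the block, so the free space sits in the middle and leaf_data_end() is simply the offset of the last item's data (or the full data area when the leaf is empty). The following stand-alone sketch, using hypothetical simplified types rather than the kernel's extent_buffer API, shows the same arithmetic:

#include <stdint.h>

/* Hypothetical, simplified leaf layout used only for illustration. */
struct toy_item_hdr {
	uint32_t offset;	/* start of this item's data within the data area */
	uint32_t size;		/* length of this item's data */
};

struct toy_leaf {
	uint32_t data_area_size;	/* stands in for BTRFS_LEAF_DATA_SIZE() */
	uint32_t nritems;
	struct toy_item_hdr items[64];	/* headers grow forward from the front */
	/* item data grows backward from data_area_size toward the headers */
};

/* Mirrors the logic of leaf_data_end(): an empty leaf has an empty data
 * "stack", so its end is the whole data area; otherwise it is the offset of
 * the last item, which sits lowest in the data area.
 */
static uint32_t toy_leaf_data_end(const struct toy_leaf *leaf)
{
	if (leaf->nritems == 0)
		return leaf->data_area_size;
	return leaf->items[leaf->nritems - 1].offset;
}
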
65 :
66 : /*
67 : * Move data in a @leaf (using memmove, safe for overlapping ranges).
68 : *
69 : * @leaf: leaf that we're doing a memmove on
70 : * @dst_offset: item data offset we're moving to
71 : * @src_offset: item data offset we're moving from
72 : * @len: length of the data we're moving
73 : *
74 : * Wrapper around memmove_extent_buffer() that takes into account the header on
75 : * the leaf. The btrfs_item offsets start directly after the header, so we
76 : * have to adjust any offsets to account for the header in the leaf. This
77 : * handles that math to simplify the callers.
78 : */
79 : static inline void memmove_leaf_data(const struct extent_buffer *leaf,
80 : unsigned long dst_offset,
81 : unsigned long src_offset,
82 : unsigned long len)
83 : {
84 100364810 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
85 : btrfs_item_nr_offset(leaf, 0) + src_offset, len);
86 14249340 : }
87 :
88 : /*
89 : * Copy item data from @src into @dst at the given offsets.
90 : *
91 : * @dst: destination leaf that we're copying into
92 : * @src: source leaf that we're copying from
93 : * @dst_offset: item data offset we're copying to
94 : * @src_offset: item data offset we're copying from
95 : * @len: length of the data we're copying
96 : *
97 : * Wrapper around copy_extent_buffer() that takes into account the header on
98 : * the leaf. The btrfs_item offsets start directly after the header, so we
99 : * have to adjust any offsets to account for the header in the leaf. This
100 : * handles that math to simplify the callers.
101 : */
102 : static inline void copy_leaf_data(const struct extent_buffer *dst,
103 : const struct extent_buffer *src,
104 : unsigned long dst_offset,
105 : unsigned long src_offset, unsigned long len)
106 : {
107 2928652 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
108 : btrfs_item_nr_offset(src, 0) + src_offset, len);
109 : }
110 :
111 : /*
112 : * Move items in a @leaf (using memmove).
113 : *
114 : * @leaf: leaf that we're moving the items within
115 : * @dst_item: the item nr we're copying into
116 : * @src_item: the item nr we're copying from
117 : * @nr_items: the number of items to copy
118 : *
119 : * Wrapper around memmove_extent_buffer() that does the math to get the
120 : * appropriate offsets into the leaf from the item numbers.
121 : */
122 61865126 : static inline void memmove_leaf_items(const struct extent_buffer *leaf,
123 : int dst_item, int src_item, int nr_items)
124 : {
125 61865126 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
126 : btrfs_item_nr_offset(leaf, src_item),
127 : nr_items * sizeof(struct btrfs_item));
128 1012993 : }
129 :
130 : /*
131 : * Copy items from @src into @dst at the given item numbers.
132 : *
133 : * @dst: destination leaf for the items
134 : * @src: source leaf for the items
135 : * @dst_item: the item nr we're copying into
136 : * @src_item: the item nr we're copying from
137 : * @nr_items: the number of items to copy
138 : *
139 : * Wrapper around copy_extent_buffer() that does the math to get the
140 : * appropriate offsets into the leaf from the item numbers.
141 : */
142 : static inline void copy_leaf_items(const struct extent_buffer *dst,
143 : const struct extent_buffer *src,
144 : int dst_item, int src_item, int nr_items)
145 : {
146 1097818 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
147 : btrfs_item_nr_offset(src, src_item),
148 : nr_items * sizeof(struct btrfs_item));
149 : }
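
All four helpers above perform the same translation: callers think in offsets or item numbers relative to the item area that begins right after the leaf header, and the wrapper adds the absolute position of item 0 before calling the raw extent-buffer move or copy. A minimal user-space analogue of the data-offset variant, with hypothetical names standing in for the kernel helpers:

#include <stdint.h>
#include <string.h>

#define TOY_LEAF_HEADER_SIZE 101	/* stand-in for the leaf header length */

/* Raw move, analogous to memmove_extent_buffer(): offsets are absolute
 * within the block and overlapping ranges are allowed.
 */
static void toy_memmove_buf(uint8_t *block, unsigned long dst,
			    unsigned long src, unsigned long len)
{
	memmove(block + dst, block + src, len);
}

/* Analogous to memmove_leaf_data(): the caller's offsets are relative to
 * the item area, so shift both by the header size before moving.
 */
static void toy_memmove_leaf_data(uint8_t *block, unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	toy_memmove_buf(block, TOY_LEAF_HEADER_SIZE + dst_offset,
			TOY_LEAF_HEADER_SIZE + src_offset, len);
}
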
150 :
151 : /* This exists for btrfs-progs usages. */
152 0 : u16 btrfs_csum_type_size(u16 type)
153 : {
154 3250 : return btrfs_csums[type].size;
155 : }
156 :
157 3250 : int btrfs_super_csum_size(const struct btrfs_super_block *s)
158 : {
159 3250 : u16 t = btrfs_super_csum_type(s);
160 : /*
161 : * csum type is validated at mount time
162 : */
163 3250 : return btrfs_csum_type_size(t);
164 : }
165 :
166 3238 : const char *btrfs_super_csum_name(u16 csum_type)
167 : {
168 : /* csum type is validated at mount time */
169 3238 : return btrfs_csums[csum_type].name;
170 : }
171 :
172 : /*
173 : * Return driver name if defined, otherwise the name that's also a valid driver
174 : * name
175 : */
176 3238 : const char *btrfs_super_csum_driver(u16 csum_type)
177 : {
178 : /* csum type is validated at mount time */
179 3238 : return btrfs_csums[csum_type].driver[0] ?
180 3238 : btrfs_csums[csum_type].driver :
181 : btrfs_csums[csum_type].name;
182 : }
183 :
184 0 : size_t __attribute_const__ btrfs_get_num_csums(void)
185 : {
186 0 : return ARRAY_SIZE(btrfs_csums);
187 : }
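
The btrfs_csums[] table above maps each checksum type to its digest size, its user-visible name and, where it differs, the crypto driver name; btrfs_super_csum_driver() falls back to the name when no driver string was set. A self-contained sketch of the same table-plus-fallback pattern (the toy_* names are illustrative; the values mirror the table above):

#include <stdio.h>
#include <stdint.h>

enum toy_csum_type { TOY_CRC32, TOY_XXHASH, TOY_SHA256, TOY_BLAKE2 };

static const struct {
	uint16_t size;
	const char name[10];
	const char driver[12];
} toy_csums[] = {
	[TOY_CRC32]  = { .size = 4,  .name = "crc32c" },
	[TOY_XXHASH] = { .size = 8,  .name = "xxhash64" },
	[TOY_SHA256] = { .size = 32, .name = "sha256" },
	[TOY_BLAKE2] = { .size = 32, .name = "blake2b", .driver = "blake2b-256" },
};

/* Same fallback rule as btrfs_super_csum_driver(): use the explicit driver
 * string if present, otherwise the name doubles as the driver name.
 */
static const char *toy_csum_driver(enum toy_csum_type t)
{
	return toy_csums[t].driver[0] ? toy_csums[t].driver : toy_csums[t].name;
}

int main(void)
{
	printf("%s -> %s (%d bytes)\n", toy_csums[TOY_BLAKE2].name,
	       toy_csum_driver(TOY_BLAKE2), toy_csums[TOY_BLAKE2].size);
	return 0;
}
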
188 :
189 195919145 : struct btrfs_path *btrfs_alloc_path(void)
190 : {
191 195919145 : might_sleep();
192 :
193 195905519 : return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
194 : }
195 :
196 : /* this also releases the path */
197 293549515 : void btrfs_free_path(struct btrfs_path *p)
198 : {
199 293549515 : if (!p)
200 : return;
201 195929398 : btrfs_release_path(p);
202 195941508 : kmem_cache_free(btrfs_path_cachep, p);
203 : }
204 :
205 : /*
206 : * path release drops references on the extent buffers in the path
207 : * and it drops any locks held by this path
208 : *
209 : * It is safe to call this on paths that have no locks or extent buffers held.
210 : */
211 577018141 : noinline void btrfs_release_path(struct btrfs_path *p)
212 : {
213 577018141 : int i;
214 :
215 5190920240 : for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
216 4613964429 : p->slots[i] = 0;
217 4613964429 : if (!p->nodes[i])
218 3673106516 : continue;
219 940857913 : if (p->locks[i]) {
220 416740227 : btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
221 416687320 : p->locks[i] = 0;
222 : }
223 940805006 : free_extent_buffer(p->nodes[i]);
224 940795583 : p->nodes[i] = NULL;
225 : }
226 576955811 : }
227 :
228 : /*
229 : * We want the transaction abort to print stack trace only for errors where the
230 : * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
231 : * caused by external factors.
232 : */
233 17 : bool __cold abort_should_print_stack(int errno)
234 : {
235 17 : switch (errno) {
236 : case -EIO:
237 : case -EROFS:
238 : case -ENOMEM:
239 : return false;
240 : }
241 0 : return true;
242 : }
243 :
244 : /*
245 : * safely gets a reference on the root node of a tree. A lock
246 : * is not taken, so a concurrent writer may put a different node
247 : * at the root of the tree. See btrfs_lock_root_node for the
248 : * looping required.
249 : *
250 : * The extent buffer returned by this has a reference taken, so
251 : * it won't disappear. It may stop being the root of the tree
252 : * at any time because there are no locks held.
253 : */
254 487171094 : struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
255 : {
256 487171094 : struct extent_buffer *eb;
257 :
258 487171094 : while (1) {
259 487171094 : rcu_read_lock();
260 487238304 : eb = rcu_dereference(root->node);
261 :
262 : /*
263 : * RCU really hurts here, we could free up the root node because
264 : * it was COWed but we may not get the new root node yet so do
265 : * the inc_not_zero dance and if it doesn't work then
266 : * synchronize_rcu and try again.
267 : */
268 974660781 : if (atomic_inc_not_zero(&eb->refs)) {
269 487422477 : rcu_read_unlock();
270 487410955 : break;
271 : }
272 0 : rcu_read_unlock();
273 0 : synchronize_rcu();
274 : }
275 487410955 : return eb;
276 : }
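
The loop above depends on "increment the reference count only if it is not already zero": a count of zero means the buffer is already being torn down, so the reader drops out of the RCU read section, waits for the grace period and retries with whatever root->node points to by then. A hedged user-space equivalent of that primitive built on C11 atomics (the surrounding RCU machinery is omitted):

#include <stdatomic.h>
#include <stdbool.h>

/* Equivalent of atomic_inc_not_zero(): take a reference only while the
 * object is still live (refcount > 0). Returns false if the count already
 * reached zero, in which case the caller must look up the pointer again.
 */
static bool ref_inc_not_zero(atomic_uint *refs)
{
	unsigned int old = atomic_load(refs);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refs, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS; loop and retry */
	}
	return false;
}
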
277 :
278 : /*
279 : * Cowonly roots (non-shareable trees, everything that is not a subvolume or reloc root)
280 : * just get put onto a simple dirty list. The transaction walks this list to make
281 : * sure they get properly updated on disk.
282 : */
283 1318714 : static void add_root_to_dirty_list(struct btrfs_root *root)
284 : {
285 1318714 : struct btrfs_fs_info *fs_info = root->fs_info;
286 :
287 1318714 : if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
288 0 : !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
289 : return;
290 :
291 446535 : spin_lock(&fs_info->trans_lock);
292 446533 : if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
293 : /* Want the extent tree to be the last on the list */
294 446533 : if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
295 203513 : list_move_tail(&root->dirty_list,
296 : &fs_info->dirty_cowonly_roots);
297 : else
298 243020 : list_move(&root->dirty_list,
299 : &fs_info->dirty_cowonly_roots);
300 : }
301 446533 : spin_unlock(&fs_info->trans_lock);
302 : }
303 :
304 : /*
305 : * used by snapshot creation to make a copy of a root for a tree with
306 : * a given objectid. The buffer with the new root node is returned in
307 : * cow_ret, and this func returns zero on success or a negative error code.
308 : */
309 2528 : int btrfs_copy_root(struct btrfs_trans_handle *trans,
310 : struct btrfs_root *root,
311 : struct extent_buffer *buf,
312 : struct extent_buffer **cow_ret, u64 new_root_objectid)
313 : {
314 2528 : struct btrfs_fs_info *fs_info = root->fs_info;
315 2528 : struct extent_buffer *cow;
316 2528 : int ret = 0;
317 2528 : int level;
318 2528 : struct btrfs_disk_key disk_key;
319 :
320 5056 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
321 : trans->transid != fs_info->running_transaction->transid);
322 5056 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
323 : trans->transid != root->last_trans);
324 :
325 2528 : level = btrfs_header_level(buf);
326 2528 : if (level == 0)
327 817 : btrfs_item_key(buf, &disk_key, 0);
328 : else
329 1711 : btrfs_node_key(buf, &disk_key, 0);
330 :
331 2528 : cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
332 : &disk_key, level, buf->start, 0,
333 : BTRFS_NESTING_NEW_ROOT);
334 2528 : if (IS_ERR(cow))
335 0 : return PTR_ERR(cow);
336 :
337 2528 : copy_extent_buffer_full(cow, buf);
338 2528 : btrfs_set_header_bytenr(cow, cow->start);
339 2528 : btrfs_set_header_generation(cow, trans->transid);
340 2528 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
341 2528 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
342 : BTRFS_HEADER_FLAG_RELOC);
343 2528 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
344 1500 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
345 : else
346 1028 : btrfs_set_header_owner(cow, new_root_objectid);
347 :
348 2528 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
349 :
350 2528 : WARN_ON(btrfs_header_generation(buf) > trans->transid);
351 2528 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
352 1500 : ret = btrfs_inc_ref(trans, root, cow, 1);
353 : else
354 1028 : ret = btrfs_inc_ref(trans, root, cow, 0);
355 2528 : if (ret) {
356 0 : btrfs_tree_unlock(cow);
357 0 : free_extent_buffer(cow);
358 0 : btrfs_abort_transaction(trans, ret);
359 0 : return ret;
360 : }
361 :
362 2528 : btrfs_mark_buffer_dirty(cow);
363 2528 : *cow_ret = cow;
364 2528 : return 0;
365 : }
366 :
367 : /*
368 : * check if the tree block can be shared by multiple trees
369 : */
370 8506895 : int btrfs_block_can_be_shared(struct btrfs_root *root,
371 : struct extent_buffer *buf)
372 : {
373 : /*
374 : * Tree blocks not in shareable trees and tree roots are never shared.
375 : * If a block was allocated after the last snapshot and the block was
376 : * not allocated by tree relocation, we know the block is not shared.
377 : */
378 17013790 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
379 4657169 : buf != root->node && buf != root->commit_root &&
380 : (btrfs_header_generation(buf) <=
381 2596648 : btrfs_root_last_snapshot(&root->root_item) ||
382 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
383 1894110 : return 1;
384 :
385 : return 0;
386 : }
387 :
388 8417287 : static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
389 : struct btrfs_root *root,
390 : struct extent_buffer *buf,
391 : struct extent_buffer *cow,
392 : int *last_ref)
393 : {
394 8417287 : struct btrfs_fs_info *fs_info = root->fs_info;
395 8417287 : u64 refs;
396 8417287 : u64 owner;
397 8417287 : u64 flags;
398 8417287 : u64 new_flags = 0;
399 8417287 : int ret;
400 :
401 : /*
402 : * Backrefs update rules:
403 : *
404 : * Always use full backrefs for extent pointers in tree block
405 : * allocated by tree relocation.
406 : *
407 : * If a shared tree block is no longer referenced by its owner
408 : * tree (btrfs_header_owner(buf) == root->root_key.objectid),
409 : * use full backrefs for extent pointers in tree block.
410 : *
411 : * If a tree block is being relocated
412 : * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
413 : * use full backrefs for extent pointers in tree block.
414 : * The reason for this is that some operations (such as dropping a tree)
415 : * are only allowed for blocks that use full backrefs.
416 : */
417 :
418 8417287 : if (btrfs_block_can_be_shared(root, buf)) {
419 1893960 : ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
420 : btrfs_header_level(buf), 1,
421 : &refs, &flags);
422 1893960 : if (ret)
423 : return ret;
424 1893960 : if (unlikely(refs == 0)) {
425 0 : btrfs_crit(fs_info,
426 : "found 0 references for tree block at bytenr %llu level %d root %llu",
427 : buf->start, btrfs_header_level(buf),
428 : btrfs_root_id(root));
429 0 : ret = -EUCLEAN;
430 0 : btrfs_abort_transaction(trans, ret);
431 0 : return ret;
432 : }
433 : } else {
434 6523227 : refs = 1;
435 6523227 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
436 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
437 635 : flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
438 : else
439 6522592 : flags = 0;
440 : }
441 :
442 8417187 : owner = btrfs_header_owner(buf);
443 8417187 : BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
444 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
445 :
446 8417187 : if (refs > 1) {
447 1892325 : if ((owner == root->root_key.objectid ||
448 1891401 : root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
449 1891401 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
450 71699 : ret = btrfs_inc_ref(trans, root, buf, 1);
451 71699 : if (ret)
452 : return ret;
453 :
454 71699 : if (root->root_key.objectid ==
455 : BTRFS_TREE_RELOC_OBJECTID) {
456 32684 : ret = btrfs_dec_ref(trans, root, buf, 0);
457 32684 : if (ret)
458 : return ret;
459 32684 : ret = btrfs_inc_ref(trans, root, cow, 1);
460 32684 : if (ret)
461 : return ret;
462 : }
463 71699 : new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
464 : } else {
465 :
466 1820626 : if (root->root_key.objectid ==
467 : BTRFS_TREE_RELOC_OBJECTID)
468 1819681 : ret = btrfs_inc_ref(trans, root, cow, 1);
469 : else
470 945 : ret = btrfs_inc_ref(trans, root, cow, 0);
471 1820626 : if (ret)
472 0 : return ret;
473 : }
474 71699 : if (new_flags != 0) {
475 71699 : ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
476 71699 : if (ret)
477 0 : return ret;
478 : }
479 : } else {
480 6524862 : if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
481 1217 : if (root->root_key.objectid ==
482 : BTRFS_TREE_RELOC_OBJECTID)
483 1048 : ret = btrfs_inc_ref(trans, root, cow, 1);
484 : else
485 169 : ret = btrfs_inc_ref(trans, root, cow, 0);
486 1217 : if (ret)
487 : return ret;
488 1217 : ret = btrfs_dec_ref(trans, root, buf, 1);
489 1217 : if (ret)
490 : return ret;
491 : }
492 6524862 : btrfs_clear_buffer_dirty(trans, buf);
493 6524868 : *last_ref = 1;
494 : }
495 : return 0;
496 : }
497 :
498 : /*
499 : * does the dirty work of COWing a single block. The parent block (if
500 : * supplied) is updated to point to the new cow copy. The new buffer is marked
501 : * dirty and returned locked. If you modify the block it needs to be marked
502 : * dirty again.
503 : *
504 : * search_start -- an allocation hint for the new block
505 : *
506 : * empty_size -- a hint that you plan on doing more cow. This is the size in
507 : * bytes of free space the allocator should try to find next to the block it returns.
508 : * This is just a hint and may be ignored by the allocator.
509 : */
510 8419474 : static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
511 : struct btrfs_root *root,
512 : struct extent_buffer *buf,
513 : struct extent_buffer *parent, int parent_slot,
514 : struct extent_buffer **cow_ret,
515 : u64 search_start, u64 empty_size,
516 : enum btrfs_lock_nesting nest)
517 : {
518 8419474 : struct btrfs_fs_info *fs_info = root->fs_info;
519 8419474 : struct btrfs_disk_key disk_key;
520 8419474 : struct extent_buffer *cow;
521 8419474 : int level, ret;
522 8419474 : int last_ref = 0;
523 8419474 : int unlock_orig = 0;
524 8419474 : u64 parent_start = 0;
525 :
526 8419474 : if (*cow_ret == buf)
527 8419432 : unlock_orig = 1;
528 :
529 8419474 : btrfs_assert_tree_write_locked(buf);
530 :
531 13077962 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
532 : trans->transid != fs_info->running_transaction->transid);
533 13077952 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
534 : trans->transid != root->last_trans);
535 :
536 8419474 : level = btrfs_header_level(buf);
537 :
538 8419474 : if (level == 0)
539 7551445 : btrfs_item_key(buf, &disk_key, 0);
540 : else
541 868029 : btrfs_node_key(buf, &disk_key, 0);
542 :
543 8419418 : if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
544 1852748 : parent_start = parent->start;
545 :
546 8419418 : cow = btrfs_alloc_tree_block(trans, root, parent_start,
547 : root->root_key.objectid, &disk_key, level,
548 : search_start, empty_size, nest);
549 8419641 : if (IS_ERR(cow))
550 2257 : return PTR_ERR(cow);
551 :
552 : /* cow is set to blocking by btrfs_init_new_buffer */
553 :
554 8417384 : copy_extent_buffer_full(cow, buf);
555 8417322 : btrfs_set_header_bytenr(cow, cow->start);
556 8417322 : btrfs_set_header_generation(cow, trans->transid);
557 8417322 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
558 8417269 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
559 : BTRFS_HEADER_FLAG_RELOC);
560 8417303 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
561 1853413 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
562 : else
563 6563890 : btrfs_set_header_owner(cow, root->root_key.objectid);
564 :
565 8417303 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
566 :
567 8417314 : ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
568 8417163 : if (ret) {
569 0 : btrfs_tree_unlock(cow);
570 0 : free_extent_buffer(cow);
571 0 : btrfs_abort_transaction(trans, ret);
572 0 : return ret;
573 : }
574 :
575 16834326 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
576 4656159 : ret = btrfs_reloc_cow_block(trans, root, buf, cow);
577 4656115 : if (ret) {
578 0 : btrfs_tree_unlock(cow);
579 0 : free_extent_buffer(cow);
580 0 : btrfs_abort_transaction(trans, ret);
581 0 : return ret;
582 : }
583 : }
584 :
585 8417119 : if (buf == root->node) {
586 1315789 : WARN_ON(parent && parent != buf);
587 1315789 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
588 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
589 665 : parent_start = buf->start;
590 :
591 1315789 : ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
592 1315776 : if (ret < 0) {
593 0 : btrfs_tree_unlock(cow);
594 0 : free_extent_buffer(cow);
595 0 : btrfs_abort_transaction(trans, ret);
596 0 : return ret;
597 : }
598 1315776 : atomic_inc(&cow->refs);
599 1315790 : rcu_assign_pointer(root->node, cow);
600 :
601 1315790 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
602 : parent_start, last_ref);
603 1315796 : free_extent_buffer(buf);
604 1315799 : add_root_to_dirty_list(root);
605 : } else {
606 7101330 : WARN_ON(trans->transid != btrfs_header_generation(parent));
607 7101330 : ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
608 : BTRFS_MOD_LOG_KEY_REPLACE);
609 7101270 : if (ret) {
610 0 : btrfs_tree_unlock(cow);
611 0 : free_extent_buffer(cow);
612 0 : btrfs_abort_transaction(trans, ret);
613 0 : return ret;
614 : }
615 7101270 : btrfs_set_node_blockptr(parent, parent_slot,
616 : cow->start);
617 7101159 : btrfs_set_node_ptr_generation(parent, parent_slot,
618 : trans->transid);
619 7101212 : btrfs_mark_buffer_dirty(parent);
620 7101427 : if (last_ref) {
621 5209102 : ret = btrfs_tree_mod_log_free_eb(buf);
622 5208756 : if (ret) {
623 0 : btrfs_tree_unlock(cow);
624 0 : free_extent_buffer(cow);
625 0 : btrfs_abort_transaction(trans, ret);
626 0 : return ret;
627 : }
628 : }
629 7101081 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
630 : parent_start, last_ref);
631 : }
632 8417402 : if (unlock_orig)
633 8417398 : btrfs_tree_unlock(buf);
634 8417389 : free_extent_buffer_stale(buf);
635 8417193 : btrfs_mark_buffer_dirty(cow);
636 8417411 : *cow_ret = cow;
637 8417411 : return 0;
638 : }
639 :
640 484252619 : static inline int should_cow_block(struct btrfs_trans_handle *trans,
641 : struct btrfs_root *root,
642 : struct extent_buffer *buf)
643 : {
644 484252619 : if (btrfs_is_testing(root->fs_info))
645 : return 0;
646 :
647 : /* Ensure we can see the FORCE_COW bit */
648 484252619 : smp_mb__before_atomic();
649 :
650 : /*
651 : * We do not need to cow a block if
652 : * 1) this block was created or changed in this transaction and has not been written yet;
653 : * 2) this block is not a RELOC-flagged block in a tree other than the relocation tree;
654 : * 3) the root is not forced COW.
655 : *
656 : * What is forced COW:
657 : * when we create snapshot during committing the transaction,
658 : * after we've finished copying src root, we must COW the shared
659 : * block to ensure the metadata consistency.
660 : */
661 484252619 : if (btrfs_header_generation(buf) == trans->transid &&
662 468541065 : !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
663 468541065 : !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
664 468538030 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
665 468538030 : !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
666 468513116 : return 0;
667 : return 1;
668 : }
669 :
670 : /*
671 : * cows a single block, see __btrfs_cow_block for the real work.
672 : * This version of it has extra checks so that a block isn't COWed more than
673 : * once per transaction, as long as it hasn't been written yet
674 : */
675 12714945 : noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
676 : struct btrfs_root *root, struct extent_buffer *buf,
677 : struct extent_buffer *parent, int parent_slot,
678 : struct extent_buffer **cow_ret,
679 : enum btrfs_lock_nesting nest)
680 : {
681 12714945 : struct btrfs_fs_info *fs_info = root->fs_info;
682 12714945 : u64 search_start;
683 12714945 : int ret;
684 :
685 25429890 : if (test_bit(BTRFS_ROOT_DELETING, &root->state))
686 0 : btrfs_err(fs_info,
687 : "COW'ing blocks on a fs root that's being dropped");
688 :
689 12714945 : if (trans->transaction != fs_info->running_transaction)
690 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
691 : trans->transid,
692 : fs_info->running_transaction->transid);
693 :
694 12714889 : if (trans->transid != fs_info->generation)
695 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
696 : trans->transid, fs_info->generation);
697 :
698 12714889 : if (!should_cow_block(trans, root, buf)) {
699 4295405 : *cow_ret = buf;
700 4295405 : return 0;
701 : }
702 :
703 8419454 : search_start = buf->start & ~((u64)SZ_1G - 1);
704 :
705 : /*
706 : * Before CoWing this block for later modification, check if it's
707 : * the subtree root and do the delayed subtree trace if needed.
708 : *
709 : * Also, we don't care about the error, as it's handled internally.
710 : */
711 8419454 : btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
712 8419435 : ret = __btrfs_cow_block(trans, root, buf, parent,
713 : parent_slot, cow_ret, search_start, 0, nest);
714 :
715 8419669 : trace_btrfs_cow_block(root, buf, *cow_ret);
716 :
717 8419669 : return ret;
718 : }
719 : ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
720 :
721 : /*
722 : * helper function for defrag to decide if two blocks pointed to by a
723 : * node are actually close by
724 : */
725 0 : static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
726 : {
727 0 : if (blocknr < other && other - (blocknr + blocksize) < 32768)
728 : return 1;
729 0 : if (blocknr > other && blocknr - (other + blocksize) < 32768)
730 0 : return 1;
731 : return 0;
732 : }
733 :
734 : #ifdef __LITTLE_ENDIAN
735 :
736 : /*
737 : * Compare two keys; on little-endian the disk order is the same as the CPU order
738 : * and we can avoid the conversion.
739 : */
740 : static int comp_keys(const struct btrfs_disk_key *disk_key,
741 : const struct btrfs_key *k2)
742 : {
743 5359488591 : const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
744 :
745 5359488591 : return btrfs_comp_cpu_keys(k1, k2);
746 : }
747 :
748 : #else
749 :
750 : /*
751 : * compare two keys in a memcmp fashion
752 : */
753 : static int comp_keys(const struct btrfs_disk_key *disk,
754 : const struct btrfs_key *k2)
755 : {
756 : struct btrfs_key k1;
757 :
758 : btrfs_disk_key_to_cpu(&k1, disk);
759 :
760 : return btrfs_comp_cpu_keys(&k1, k2);
761 : }
762 : #endif
763 :
764 : /*
765 : * Same as comp_keys(), only with two struct btrfs_key arguments.
766 : */
767 6861017998 : int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
768 : {
769 6861017998 : if (k1->objectid > k2->objectid)
770 : return 1;
771 5593874165 : if (k1->objectid < k2->objectid)
772 : return -1;
773 3521614119 : if (k1->type > k2->type)
774 : return 1;
775 3414851286 : if (k1->type < k2->type)
776 : return -1;
777 3242368379 : if (k1->offset > k2->offset)
778 : return 1;
779 2571356965 : if (k1->offset < k2->offset)
780 2031259698 : return -1;
781 : return 0;
782 : }
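
Keys sort lexicographically by (objectid, type, offset); on little-endian machines the on-disk byte order matches the CPU order, which is why comp_keys() can cast instead of converting. A stand-alone comparator with the same ordering, over a hypothetical struct that mirrors the three fields of struct btrfs_key:

#include <stdint.h>

struct toy_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

/* Same three-way ordering as btrfs_comp_cpu_keys(): objectid first,
 * then type, then offset; returns -1, 0 or 1.
 */
static int toy_comp_keys(const struct toy_key *k1, const struct toy_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid > k2->objectid ? 1 : -1;
	if (k1->type != k2->type)
		return k1->type > k2->type ? 1 : -1;
	if (k1->offset != k2->offset)
		return k1->offset > k2->offset ? 1 : -1;
	return 0;
}
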
783 :
784 : /*
785 : * this is used by the defrag code to go through all the
786 : * leaves pointed to by a node and reallocate them so that
787 : * disk order is close to key order
788 : */
789 0 : int btrfs_realloc_node(struct btrfs_trans_handle *trans,
790 : struct btrfs_root *root, struct extent_buffer *parent,
791 : int start_slot, u64 *last_ret,
792 : struct btrfs_key *progress)
793 : {
794 0 : struct btrfs_fs_info *fs_info = root->fs_info;
795 0 : struct extent_buffer *cur;
796 0 : u64 blocknr;
797 0 : u64 search_start = *last_ret;
798 0 : u64 last_block = 0;
799 0 : u64 other;
800 0 : u32 parent_nritems;
801 0 : int end_slot;
802 0 : int i;
803 0 : int err = 0;
804 0 : u32 blocksize;
805 0 : int progress_passed = 0;
806 0 : struct btrfs_disk_key disk_key;
807 :
808 0 : WARN_ON(trans->transaction != fs_info->running_transaction);
809 0 : WARN_ON(trans->transid != fs_info->generation);
810 :
811 0 : parent_nritems = btrfs_header_nritems(parent);
812 0 : blocksize = fs_info->nodesize;
813 0 : end_slot = parent_nritems - 1;
814 :
815 0 : if (parent_nritems <= 1)
816 : return 0;
817 :
818 0 : for (i = start_slot; i <= end_slot; i++) {
819 0 : int close = 1;
820 :
821 0 : btrfs_node_key(parent, &disk_key, i);
822 0 : if (!progress_passed && comp_keys(&disk_key, progress) < 0)
823 0 : continue;
824 :
825 0 : progress_passed = 1;
826 0 : blocknr = btrfs_node_blockptr(parent, i);
827 0 : if (last_block == 0)
828 0 : last_block = blocknr;
829 :
830 0 : if (i > 0) {
831 0 : other = btrfs_node_blockptr(parent, i - 1);
832 0 : close = close_blocks(blocknr, other, blocksize);
833 : }
834 0 : if (!close && i < end_slot) {
835 0 : other = btrfs_node_blockptr(parent, i + 1);
836 0 : close = close_blocks(blocknr, other, blocksize);
837 : }
838 0 : if (close) {
839 0 : last_block = blocknr;
840 0 : continue;
841 : }
842 :
843 0 : cur = btrfs_read_node_slot(parent, i);
844 0 : if (IS_ERR(cur))
845 0 : return PTR_ERR(cur);
846 0 : if (search_start == 0)
847 0 : search_start = last_block;
848 :
849 0 : btrfs_tree_lock(cur);
850 0 : err = __btrfs_cow_block(trans, root, cur, parent, i,
851 : &cur, search_start,
852 0 : min(16 * blocksize,
853 : (end_slot - i) * blocksize),
854 : BTRFS_NESTING_COW);
855 0 : if (err) {
856 0 : btrfs_tree_unlock(cur);
857 0 : free_extent_buffer(cur);
858 0 : break;
859 : }
860 0 : search_start = cur->start;
861 0 : last_block = cur->start;
862 0 : *last_ret = search_start;
863 0 : btrfs_tree_unlock(cur);
864 0 : free_extent_buffer(cur);
865 : }
866 : return err;
867 : }
868 :
869 : /*
870 : * Search for a key in the given extent_buffer.
871 : *
872 : * The lower boundary for the search is specified by the slot number @first_slot.
873 : * Use a value of 0 to search over the whole extent buffer. Works for both
874 : * leaves and nodes.
875 : *
876 : * The slot in the extent buffer is returned via @slot. If the key exists in the
877 : * extent buffer, then @slot will point to the slot where the key is, otherwise
878 : * it points to the slot where you would insert the key.
879 : *
880 : * Slot may point to the total number of items (i.e. one position beyond the last
881 : * key) if the key is bigger than the last key in the extent buffer.
882 : */
883 893494478 : int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
884 : const struct btrfs_key *key, int *slot)
885 : {
886 893494478 : unsigned long p;
887 893494478 : int item_size;
888 : /*
889 : * Use unsigned types for the low and high slots, so that we get a more
890 : * efficient division in the search loop below.
891 : */
892 893494478 : u32 low = first_slot;
893 893494478 : u32 high = btrfs_header_nritems(eb);
894 893494478 : int ret;
895 893494478 : const int key_size = sizeof(struct btrfs_disk_key);
896 :
897 893494478 : if (unlikely(low > high)) {
898 0 : btrfs_err(eb->fs_info,
899 : "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
900 : __func__, low, high, eb->start,
901 : btrfs_header_owner(eb), btrfs_header_level(eb));
902 0 : return -EINVAL;
903 : }
904 :
905 893494478 : if (btrfs_header_level(eb) == 0) {
906 : p = offsetof(struct btrfs_leaf, items);
907 : item_size = sizeof(struct btrfs_item);
908 : } else {
909 558820682 : p = offsetof(struct btrfs_node, ptrs);
910 558820682 : item_size = sizeof(struct btrfs_key_ptr);
911 : }
912 :
913 5967952374 : while (low < high) {
914 5270309155 : unsigned long oip;
915 5270309155 : unsigned long offset;
916 5270309155 : struct btrfs_disk_key *tmp;
917 5270309155 : struct btrfs_disk_key unaligned;
918 5270309155 : int mid;
919 :
920 5270309155 : mid = (low + high) / 2;
921 5270309155 : offset = p + mid * item_size;
922 5270309155 : oip = offset_in_page(offset);
923 :
924 5270309155 : if (oip + key_size <= PAGE_SIZE) {
925 5260931645 : const unsigned long idx = get_eb_page_index(offset);
926 5260931645 : char *kaddr = page_address(eb->pages[idx]);
927 :
928 5260931645 : oip = get_eb_offset_in_page(eb, offset);
929 5260931645 : tmp = (struct btrfs_disk_key *)(kaddr + oip);
930 : } else {
931 9377510 : read_extent_buffer(eb, &unaligned, offset, key_size);
932 9377510 : tmp = &unaligned;
933 : }
934 :
935 5270414762 : ret = comp_keys(tmp, key);
936 :
937 5270414762 : if (ret < 0)
938 3079080942 : low = mid + 1;
939 2191333820 : else if (ret > 0)
940 : high = mid;
941 : else {
942 195956866 : *slot = mid;
943 195956866 : return 0;
944 : }
945 : }
946 697643219 : *slot = low;
947 697643219 : return 1;
948 : }
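
The return convention of btrfs_bin_search() is a classic lower-bound search: 0 with *slot pointing at an exact match, or 1 with *slot pointing at the position where the key would be inserted, which may be one past the last populated slot. A minimal stand-alone version over a plain sorted array of u64 keys keeps the same convention (toy_bin_search is an illustrative name, not a kernel function):

#include <stdint.h>

/* Lower-bound search with btrfs_bin_search()-style results:
 * returns 0 and *slot = index of the match, or
 * returns 1 and *slot = insertion position (may equal nritems).
 */
static int toy_bin_search(const uint64_t *keys, uint32_t first_slot,
			  uint32_t nritems, uint64_t key, int *slot)
{
	uint32_t low = first_slot;
	uint32_t high = nritems;

	while (low < high) {
		uint32_t mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;
		else if (keys[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
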
949 :
950 366759 : static void root_add_used(struct btrfs_root *root, u32 size)
951 : {
952 366759 : spin_lock(&root->accounting_lock);
953 366759 : btrfs_set_root_used(&root->root_item,
954 : btrfs_root_used(&root->root_item) + size);
955 366759 : spin_unlock(&root->accounting_lock);
956 366759 : }
957 :
958 114812 : static void root_sub_used(struct btrfs_root *root, u32 size)
959 : {
960 114812 : spin_lock(&root->accounting_lock);
961 114812 : btrfs_set_root_used(&root->root_item,
962 : btrfs_root_used(&root->root_item) - size);
963 114812 : spin_unlock(&root->accounting_lock);
964 114812 : }
965 :
966 : /* given a node and slot number, this reads the block it points to. The
967 : * extent buffer is returned with a reference taken (but unlocked).
968 : */
969 8757501 : struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
970 : int slot)
971 : {
972 8757501 : int level = btrfs_header_level(parent);
973 8757501 : struct btrfs_tree_parent_check check = { 0 };
974 8757501 : struct extent_buffer *eb;
975 :
976 8757501 : if (slot < 0 || slot >= btrfs_header_nritems(parent))
977 : return ERR_PTR(-ENOENT);
978 :
979 8757455 : ASSERT(level);
980 :
981 8757455 : check.level = level - 1;
982 8757455 : check.transid = btrfs_node_ptr_generation(parent, slot);
983 8757492 : check.owner_root = btrfs_header_owner(parent);
984 8757492 : check.has_first_key = true;
985 8757492 : btrfs_node_key_to_cpu(parent, &check.first_key, slot);
986 :
987 8757583 : eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
988 : &check);
989 8757581 : if (IS_ERR(eb))
990 : return eb;
991 17515162 : if (!extent_buffer_uptodate(eb)) {
992 0 : free_extent_buffer(eb);
993 0 : return ERR_PTR(-EIO);
994 : }
995 :
996 : return eb;
997 : }
998 :
999 : /*
1000 : * node level balancing, used to make sure nodes are in proper order for
1001 : * item deletion. We balance from the top down, so we have to make sure
1002 : * that a deletion won't leave a node completely empty later on.
1003 : */
1004 51448221 : static noinline int balance_level(struct btrfs_trans_handle *trans,
1005 : struct btrfs_root *root,
1006 : struct btrfs_path *path, int level)
1007 : {
1008 51448221 : struct btrfs_fs_info *fs_info = root->fs_info;
1009 51448221 : struct extent_buffer *right = NULL;
1010 51448221 : struct extent_buffer *mid;
1011 51448221 : struct extent_buffer *left = NULL;
1012 51448221 : struct extent_buffer *parent = NULL;
1013 51448221 : int ret = 0;
1014 51448221 : int wret;
1015 51448221 : int pslot;
1016 51448221 : int orig_slot = path->slots[level];
1017 51448221 : u64 orig_ptr;
1018 :
1019 51448221 : ASSERT(level > 0);
1020 :
1021 51448221 : mid = path->nodes[level];
1022 :
1023 51448221 : WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
1024 51448221 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1025 :
1026 51448221 : orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1027 :
1028 51448506 : if (level < BTRFS_MAX_LEVEL - 1) {
1029 51448600 : parent = path->nodes[level + 1];
1030 51448600 : pslot = path->slots[level + 1];
1031 : }
1032 :
1033 : /*
1034 : * deal with the case where there is only one pointer in the root
1035 : * by promoting the node below to a root
1036 : */
1037 51448600 : if (!parent) {
1038 45994425 : struct extent_buffer *child;
1039 :
1040 45994425 : if (btrfs_header_nritems(mid) != 1)
1041 45994425 : return 0;
1042 :
1043 : /* promote the child to a root */
1044 1061 : child = btrfs_read_node_slot(mid, 0);
1045 1061 : if (IS_ERR(child)) {
1046 0 : ret = PTR_ERR(child);
1047 0 : goto out;
1048 : }
1049 :
1050 1061 : btrfs_tree_lock(child);
1051 1061 : ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1052 : BTRFS_NESTING_COW);
1053 1061 : if (ret) {
1054 0 : btrfs_tree_unlock(child);
1055 0 : free_extent_buffer(child);
1056 0 : goto out;
1057 : }
1058 :
1059 1061 : ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
1060 1061 : if (ret < 0) {
1061 0 : btrfs_tree_unlock(child);
1062 0 : free_extent_buffer(child);
1063 0 : btrfs_abort_transaction(trans, ret);
1064 0 : goto out;
1065 : }
1066 1061 : rcu_assign_pointer(root->node, child);
1067 :
1068 1061 : add_root_to_dirty_list(root);
1069 1061 : btrfs_tree_unlock(child);
1070 :
1071 1061 : path->locks[level] = 0;
1072 1061 : path->nodes[level] = NULL;
1073 1061 : btrfs_clear_buffer_dirty(trans, mid);
1074 1061 : btrfs_tree_unlock(mid);
1075 : /* once for the path */
1076 1061 : free_extent_buffer(mid);
1077 :
1078 1061 : root_sub_used(root, mid->len);
1079 1061 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1080 : /* once for the root ptr */
1081 1061 : free_extent_buffer_stale(mid);
1082 1061 : return 0;
1083 : }
1084 5454081 : if (btrfs_header_nritems(mid) >
1085 5454081 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1086 : return 0;
1087 :
1088 28308 : if (pslot) {
1089 28282 : left = btrfs_read_node_slot(parent, pslot - 1);
1090 28282 : if (IS_ERR(left)) {
1091 0 : ret = PTR_ERR(left);
1092 0 : left = NULL;
1093 0 : goto out;
1094 : }
1095 :
1096 28282 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1097 28282 : wret = btrfs_cow_block(trans, root, left,
1098 : parent, pslot - 1, &left,
1099 : BTRFS_NESTING_LEFT_COW);
1100 28282 : if (wret) {
1101 0 : ret = wret;
1102 0 : goto out;
1103 : }
1104 : }
1105 :
1106 28308 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1107 114 : right = btrfs_read_node_slot(parent, pslot + 1);
1108 114 : if (IS_ERR(right)) {
1109 0 : ret = PTR_ERR(right);
1110 0 : right = NULL;
1111 0 : goto out;
1112 : }
1113 :
1114 114 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1115 114 : wret = btrfs_cow_block(trans, root, right,
1116 : parent, pslot + 1, &right,
1117 : BTRFS_NESTING_RIGHT_COW);
1118 114 : if (wret) {
1119 0 : ret = wret;
1120 0 : goto out;
1121 : }
1122 : }
1123 :
1124 : /* first, try to make some room in the middle buffer */
1125 28308 : if (left) {
1126 28282 : orig_slot += btrfs_header_nritems(left);
1127 28282 : wret = push_node_left(trans, left, mid, 1);
1128 28282 : if (wret < 0)
1129 : ret = wret;
1130 : }
1131 :
1132 : /*
1133 : * then try to empty the rightmost buffer into the middle
1134 : */
1135 28308 : if (right) {
1136 114 : wret = push_node_left(trans, mid, right, 1);
1137 114 : if (wret < 0 && wret != -ENOSPC)
1138 : ret = wret;
1139 114 : if (btrfs_header_nritems(right) == 0) {
1140 87 : btrfs_clear_buffer_dirty(trans, right);
1141 87 : btrfs_tree_unlock(right);
1142 87 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
1143 87 : if (ret < 0) {
1144 0 : free_extent_buffer_stale(right);
1145 0 : right = NULL;
1146 0 : goto out;
1147 : }
1148 87 : root_sub_used(root, right->len);
1149 87 : btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1150 : 0, 1);
1151 87 : free_extent_buffer_stale(right);
1152 87 : right = NULL;
1153 : } else {
1154 27 : struct btrfs_disk_key right_key;
1155 27 : btrfs_node_key(right, &right_key, 0);
1156 27 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1157 : BTRFS_MOD_LOG_KEY_REPLACE);
1158 27 : if (ret < 0) {
1159 0 : btrfs_abort_transaction(trans, ret);
1160 0 : goto out;
1161 : }
1162 27 : btrfs_set_node_key(parent, &right_key, pslot + 1);
1163 27 : btrfs_mark_buffer_dirty(parent);
1164 : }
1165 : }
1166 28308 : if (btrfs_header_nritems(mid) == 1) {
1167 : /*
1168 : * we're not allowed to leave a node with one item in the
1169 : * tree during a delete. A deletion from lower in the tree
1170 : * could try to delete the only pointer in this node.
1171 : * So, pull some keys from the left.
1172 : * There has to be a left pointer at this point because
1173 : * otherwise we would have pulled some pointers from the
1174 : * right
1175 : */
1176 0 : if (unlikely(!left)) {
1177 0 : btrfs_crit(fs_info,
1178 : "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
1179 : parent->start, btrfs_header_level(parent),
1180 : mid->start, btrfs_root_id(root));
1181 0 : ret = -EUCLEAN;
1182 0 : btrfs_abort_transaction(trans, ret);
1183 0 : goto out;
1184 : }
1185 0 : wret = balance_node_right(trans, mid, left);
1186 0 : if (wret < 0) {
1187 0 : ret = wret;
1188 0 : goto out;
1189 : }
1190 0 : if (wret == 1) {
1191 0 : wret = push_node_left(trans, left, mid, 1);
1192 0 : if (wret < 0)
1193 : ret = wret;
1194 : }
1195 0 : BUG_ON(wret == 1);
1196 : }
1197 28308 : if (btrfs_header_nritems(mid) == 0) {
1198 22 : btrfs_clear_buffer_dirty(trans, mid);
1199 22 : btrfs_tree_unlock(mid);
1200 22 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
1201 22 : if (ret < 0) {
1202 0 : free_extent_buffer_stale(mid);
1203 0 : mid = NULL;
1204 0 : goto out;
1205 : }
1206 22 : root_sub_used(root, mid->len);
1207 22 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1208 22 : free_extent_buffer_stale(mid);
1209 22 : mid = NULL;
1210 : } else {
1211 : /* update the parent key to reflect our changes */
1212 28286 : struct btrfs_disk_key mid_key;
1213 28286 : btrfs_node_key(mid, &mid_key, 0);
1214 28286 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1215 : BTRFS_MOD_LOG_KEY_REPLACE);
1216 28286 : if (ret < 0) {
1217 0 : btrfs_abort_transaction(trans, ret);
1218 0 : goto out;
1219 : }
1220 28286 : btrfs_set_node_key(parent, &mid_key, pslot);
1221 28286 : btrfs_mark_buffer_dirty(parent);
1222 : }
1223 :
1224 : /* update the path */
1225 28308 : if (left) {
1226 28282 : if (btrfs_header_nritems(left) > orig_slot) {
1227 88 : atomic_inc(&left->refs);
1228 : /* left was locked after cow */
1229 88 : path->nodes[level] = left;
1230 88 : path->slots[level + 1] -= 1;
1231 88 : path->slots[level] = orig_slot;
1232 88 : if (mid) {
1233 66 : btrfs_tree_unlock(mid);
1234 66 : free_extent_buffer(mid);
1235 : }
1236 : } else {
1237 28194 : orig_slot -= btrfs_header_nritems(left);
1238 28194 : path->slots[level] = orig_slot;
1239 : }
1240 : }
1241 : /* double check we haven't messed things up */
1242 28308 : if (orig_ptr !=
1243 28308 : btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1244 0 : BUG();
1245 28308 : out:
1246 28308 : if (right) {
1247 27 : btrfs_tree_unlock(right);
1248 27 : free_extent_buffer(right);
1249 : }
1250 28308 : if (left) {
1251 28282 : if (path->nodes[level] != left)
1252 28194 : btrfs_tree_unlock(left);
1253 28282 : free_extent_buffer(left);
1254 : }
1255 : return ret;
1256 : }
1257 :
1258 : /* Node balancing for insertion. Here we only split or push nodes around
1259 : * when they are completely full. This is also done top down, so we
1260 : * have to be pessimistic.
1261 : */
1262 115945 : static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1263 : struct btrfs_root *root,
1264 : struct btrfs_path *path, int level)
1265 : {
1266 115945 : struct btrfs_fs_info *fs_info = root->fs_info;
1267 115945 : struct extent_buffer *right = NULL;
1268 115945 : struct extent_buffer *mid;
1269 115945 : struct extent_buffer *left = NULL;
1270 115945 : struct extent_buffer *parent = NULL;
1271 115945 : int ret = 0;
1272 115945 : int wret;
1273 115945 : int pslot;
1274 115945 : int orig_slot = path->slots[level];
1275 :
1276 115945 : if (level == 0)
1277 : return 1;
1278 :
1279 115945 : mid = path->nodes[level];
1280 115945 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1281 :
1282 115945 : if (level < BTRFS_MAX_LEVEL - 1) {
1283 115945 : parent = path->nodes[level + 1];
1284 115945 : pslot = path->slots[level + 1];
1285 : }
1286 :
1287 115945 : if (!parent)
1288 : return 1;
1289 :
1290 : /* first, try to make some room in the middle buffer */
1291 115945 : if (pslot) {
1292 69338 : u32 left_nr;
1293 :
1294 69338 : left = btrfs_read_node_slot(parent, pslot - 1);
1295 69338 : if (IS_ERR(left))
1296 0 : return PTR_ERR(left);
1297 :
1298 69338 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1299 :
1300 69338 : left_nr = btrfs_header_nritems(left);
1301 69338 : if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1302 : wret = 1;
1303 : } else {
1304 58321 : ret = btrfs_cow_block(trans, root, left, parent,
1305 : pslot - 1, &left,
1306 : BTRFS_NESTING_LEFT_COW);
1307 58321 : if (ret)
1308 : wret = 1;
1309 : else {
1310 58321 : wret = push_node_left(trans, left, mid, 0);
1311 : }
1312 : }
1313 58321 : if (wret < 0)
1314 : ret = wret;
1315 58321 : if (wret == 0) {
1316 58321 : struct btrfs_disk_key disk_key;
1317 58321 : orig_slot += left_nr;
1318 58321 : btrfs_node_key(mid, &disk_key, 0);
1319 58321 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1320 : BTRFS_MOD_LOG_KEY_REPLACE);
1321 58321 : if (ret < 0) {
1322 0 : btrfs_tree_unlock(left);
1323 0 : free_extent_buffer(left);
1324 0 : btrfs_abort_transaction(trans, ret);
1325 0 : return ret;
1326 : }
1327 58321 : btrfs_set_node_key(parent, &disk_key, pslot);
1328 58321 : btrfs_mark_buffer_dirty(parent);
1329 58321 : if (btrfs_header_nritems(left) > orig_slot) {
1330 279 : path->nodes[level] = left;
1331 279 : path->slots[level + 1] -= 1;
1332 279 : path->slots[level] = orig_slot;
1333 279 : btrfs_tree_unlock(mid);
1334 279 : free_extent_buffer(mid);
1335 : } else {
1336 58042 : orig_slot -=
1337 : btrfs_header_nritems(left);
1338 58042 : path->slots[level] = orig_slot;
1339 58042 : btrfs_tree_unlock(left);
1340 58042 : free_extent_buffer(left);
1341 : }
1342 58321 : return 0;
1343 : }
1344 11017 : btrfs_tree_unlock(left);
1345 11017 : free_extent_buffer(left);
1346 : }
1347 :
1348 : /*
1349 : * then try to empty the rightmost buffer into the middle
1350 : */
1351 57624 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1352 57326 : u32 right_nr;
1353 :
1354 57326 : right = btrfs_read_node_slot(parent, pslot + 1);
1355 57326 : if (IS_ERR(right))
1356 0 : return PTR_ERR(right);
1357 :
1358 57326 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1359 :
1360 57326 : right_nr = btrfs_header_nritems(right);
1361 57326 : if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1362 : wret = 1;
1363 : } else {
1364 57287 : ret = btrfs_cow_block(trans, root, right,
1365 : parent, pslot + 1,
1366 : &right, BTRFS_NESTING_RIGHT_COW);
1367 57287 : if (ret)
1368 : wret = 1;
1369 : else {
1370 57287 : wret = balance_node_right(trans, right, mid);
1371 : }
1372 : }
1373 57287 : if (wret < 0)
1374 : ret = wret;
1375 57287 : if (wret == 0) {
1376 57287 : struct btrfs_disk_key disk_key;
1377 :
1378 57287 : btrfs_node_key(right, &disk_key, 0);
1379 57287 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1380 : BTRFS_MOD_LOG_KEY_REPLACE);
1381 57287 : if (ret < 0) {
1382 0 : btrfs_tree_unlock(right);
1383 0 : free_extent_buffer(right);
1384 0 : btrfs_abort_transaction(trans, ret);
1385 0 : return ret;
1386 : }
1387 57287 : btrfs_set_node_key(parent, &disk_key, pslot + 1);
1388 57287 : btrfs_mark_buffer_dirty(parent);
1389 :
1390 57287 : if (btrfs_header_nritems(mid) <= orig_slot) {
1391 80 : path->nodes[level] = right;
1392 80 : path->slots[level + 1] += 1;
1393 80 : path->slots[level] = orig_slot -
1394 : btrfs_header_nritems(mid);
1395 80 : btrfs_tree_unlock(mid);
1396 80 : free_extent_buffer(mid);
1397 : } else {
1398 57207 : btrfs_tree_unlock(right);
1399 57207 : free_extent_buffer(right);
1400 : }
1401 57287 : return 0;
1402 : }
1403 39 : btrfs_tree_unlock(right);
1404 39 : free_extent_buffer(right);
1405 : }
1406 : return 1;
1407 : }
1408 :
1409 : /*
1410 : * readahead one full node of leaves, finding things that are close
1411 : * to the block in 'slot', and triggering ra on them.
1412 : */
1413 29297 : static void reada_for_search(struct btrfs_fs_info *fs_info,
1414 : struct btrfs_path *path,
1415 : int level, int slot, u64 objectid)
1416 : {
1417 29297 : struct extent_buffer *node;
1418 29297 : struct btrfs_disk_key disk_key;
1419 29297 : u32 nritems;
1420 29297 : u64 search;
1421 29297 : u64 target;
1422 29297 : u64 nread = 0;
1423 29297 : u64 nread_max;
1424 29297 : u32 nr;
1425 29297 : u32 blocksize;
1426 29297 : u32 nscan = 0;
1427 :
1428 29297 : if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1429 470 : return;
1430 :
1431 28828 : if (!path->nodes[level])
1432 : return;
1433 :
1434 28828 : node = path->nodes[level];
1435 :
1436 : /*
1437 : * Since the time between visiting leaves is much shorter than the time
1438 : * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1439 : * much IO at once (possibly random).
1440 : */
1441 28828 : if (path->reada == READA_FORWARD_ALWAYS) {
1442 20120 : if (level > 1)
1443 6742 : nread_max = node->fs_info->nodesize;
1444 : else
1445 : nread_max = SZ_128K;
1446 : } else {
1447 : nread_max = SZ_64K;
1448 : }
1449 :
1450 28828 : search = btrfs_node_blockptr(node, slot);
1451 28827 : blocksize = fs_info->nodesize;
1452 28827 : if (path->reada != READA_FORWARD_ALWAYS) {
1453 8707 : struct extent_buffer *eb;
1454 :
1455 8707 : eb = find_extent_buffer(fs_info, search);
1456 8707 : if (eb) {
1457 1 : free_extent_buffer(eb);
1458 1 : return;
1459 : }
1460 : }
1461 :
1462 28826 : target = search;
1463 :
1464 28826 : nritems = btrfs_header_nritems(node);
1465 28826 : nr = slot;
1466 :
1467 372338 : while (1) {
1468 372338 : if (path->reada == READA_BACK) {
1469 418 : if (nr == 0)
1470 : break;
1471 418 : nr--;
1472 371920 : } else if (path->reada == READA_FORWARD ||
1473 : path->reada == READA_FORWARD_ALWAYS) {
1474 371916 : nr++;
1475 371916 : if (nr >= nritems)
1476 : break;
1477 : }
1478 366441 : if (path->reada == READA_BACK && objectid) {
1479 418 : btrfs_node_key(node, &disk_key, nr);
1480 418 : if (btrfs_disk_key_objectid(&disk_key) != objectid)
1481 : break;
1482 : }
1483 366227 : search = btrfs_node_blockptr(node, nr);
1484 366232 : if (path->reada == READA_FORWARD_ALWAYS ||
1485 241019 : (search <= target && target - search <= 65536) ||
1486 190984 : (search > target && search - target <= 65536)) {
1487 135931 : btrfs_readahead_node_child(node, nr);
1488 135927 : nread += blocksize;
1489 : }
1490 366228 : nscan++;
1491 366228 : if (nread > nread_max || nscan > 32)
1492 : break;
1493 : }
1494 : }
1495 :
1496 51564605 : static noinline void reada_for_balance(struct btrfs_path *path, int level)
1497 : {
1498 51564605 : struct extent_buffer *parent;
1499 51564605 : int slot;
1500 51564605 : int nritems;
1501 :
1502 51564605 : parent = path->nodes[level + 1];
1503 51564605 : if (!parent)
1504 : return;
1505 :
1506 5570026 : nritems = btrfs_header_nritems(parent);
1507 5570026 : slot = path->slots[level + 1];
1508 :
1509 5570026 : if (slot > 0)
1510 5157829 : btrfs_readahead_node_child(parent, slot - 1);
1511 5570026 : if (slot + 1 < nritems)
1512 1157749 : btrfs_readahead_node_child(parent, slot + 1);
1513 : }
1514 :
1515 :
1516 : /*
1517 : * when we walk down the tree, it is usually safe to unlock the higher layers
1518 : * in the tree. The exceptions are when our path goes through slot 0, because
1519 : * operations on the tree might require changing key pointers higher up in the
1520 : * tree.
1521 : *
1522 : * callers might also have set path->keep_locks, which tells this code to keep
1523 : * the lock if the path points to the last slot in the block. This is part of
1524 : * walking through the tree, and selecting the next slot in the higher block.
1525 : *
1526 : * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
1527 : * if lowest_unlock is 1, level 0 won't be unlocked
1528 : */
1529 917385251 : static noinline void unlock_up(struct btrfs_path *path, int level,
1530 : int lowest_unlock, int min_write_lock_level,
1531 : int *write_lock_level)
1532 : {
1533 917385251 : int i;
1534 917385251 : int skip_level = level;
1535 917385251 : bool check_skip = true;
1536 :
1537 2057615874 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1538 2057615874 : if (!path->nodes[i])
1539 : break;
1540 1522075511 : if (!path->locks[i])
1541 : break;
1542 :
1543 1139197498 : if (check_skip) {
1544 1110867862 : if (path->slots[i] == 0) {
1545 148176357 : skip_level = i + 1;
1546 148176357 : continue;
1547 : }
1548 :
1549 962691505 : if (path->keep_locks) {
1550 238483190 : u32 nritems;
1551 :
1552 238483190 : nritems = btrfs_header_nritems(path->nodes[i]);
1553 238483190 : if (nritems < 1 || path->slots[i] >= nritems - 1) {
1554 129421837 : skip_level = i + 1;
1555 129421837 : continue;
1556 : }
1557 : }
1558 : }
1559 :
1560 861599304 : if (i >= lowest_unlock && i > skip_level) {
1561 136448049 : check_skip = false;
1562 136448049 : btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1563 137481174 : path->locks[i] = 0;
1564 137481174 : if (write_lock_level &&
1565 137481174 : i > min_write_lock_level &&
1566 42719767 : i <= *write_lock_level) {
1567 152377 : *write_lock_level = i - 1;
1568 : }
1569 : }
1570 : }
1571 918418376 : }
1572 :
1573 : /*
1574 : * Helper function for btrfs_search_slot() and other functions that do a search
1575 : * on a btree. The goal is to find a tree block in the cache (the radix tree at
1576 : * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1577 : * its pages from disk.
1578 : *
1579 : * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1580 : * whole btree search, starting again from the current root node.
1581 : */
1582 : static int
1583 541363727 : read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1584 : struct extent_buffer **eb_ret, int level, int slot,
1585 : const struct btrfs_key *key)
1586 : {
1587 541363727 : struct btrfs_fs_info *fs_info = root->fs_info;
1588 541363727 : struct btrfs_tree_parent_check check = { 0 };
1589 541363727 : u64 blocknr;
1590 541363727 : u64 gen;
1591 541363727 : struct extent_buffer *tmp;
1592 541363727 : int ret;
1593 541363727 : int parent_level;
1594 541363727 : bool unlock_up;
1595 :
1596 541363727 : unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1597 541363727 : blocknr = btrfs_node_blockptr(*eb_ret, slot);
1598 541230112 : gen = btrfs_node_ptr_generation(*eb_ret, slot);
1599 541264900 : parent_level = btrfs_header_level(*eb_ret);
1600 541264900 : btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1601 541251595 : check.has_first_key = true;
1602 541251595 : check.level = parent_level - 1;
1603 541251595 : check.transid = gen;
1604 541251595 : check.owner_root = root->root_key.objectid;
1605 :
1606 : /*
1607 : * If we need to read an extent buffer from disk and we are holding locks
1608 : * on upper level nodes, we unlock all the upper nodes before reading the
1609 : * extent buffer, and then return -EAGAIN to the caller as it needs to
1610 : * restart the search. We don't release the lock on the current level
1611 : * because we need to walk this node to figure out which blocks to read.
1612 : */
1613 541251595 : tmp = find_extent_buffer(fs_info, blocknr);
1614 541570210 : if (tmp) {
1615 541469143 : if (p->reada == READA_FORWARD_ALWAYS)
1616 20111 : reada_for_search(fs_info, p, level, slot, key->objectid);
1617 :
1618 : /* first we do an atomic uptodate check */
1619 541469143 : if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1620 : /*
1621 : * Do extra check for first_key, eb can be stale due to
1622 : * being cached, read from scrub, or have multiple
1623 : * parents (shared tree blocks).
1624 : */
1625 541429181 : if (btrfs_verify_level_key(tmp,
1626 : parent_level - 1, &check.first_key, gen)) {
1627 0 : free_extent_buffer(tmp);
1628 0 : return -EUCLEAN;
1629 : }
1630 541335921 : *eb_ret = tmp;
1631 541335921 : return 0;
1632 : }
1633 :
1634 232 : if (p->nowait) {
1635 0 : free_extent_buffer(tmp);
1636 0 : return -EAGAIN;
1637 : }
1638 :
1639 232 : if (unlock_up)
1640 1 : btrfs_unlock_up_safe(p, level + 1);
1641 :
1642 : /* now we're allowed to do a blocking uptodate check */
1643 232 : ret = btrfs_read_extent_buffer(tmp, &check);
1644 232 : if (ret) {
1645 0 : free_extent_buffer(tmp);
1646 0 : btrfs_release_path(p);
1647 0 : return -EIO;
1648 : }
1649 232 : if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1650 0 : free_extent_buffer(tmp);
1651 0 : btrfs_release_path(p);
1652 0 : return -EUCLEAN;
1653 : }
1654 :
1655 232 : if (unlock_up)
1656 1 : ret = -EAGAIN;
1657 :
1658 231 : goto out;
1659 101067 : } else if (p->nowait) {
1660 : return -EAGAIN;
1661 : }
1662 :
1663 101067 : if (unlock_up) {
1664 3971 : btrfs_unlock_up_safe(p, level + 1);
1665 3971 : ret = -EAGAIN;
1666 : } else {
1667 : ret = 0;
1668 : }
1669 :
1670 101067 : if (p->reada != READA_NONE)
1671 9185 : reada_for_search(fs_info, p, level, slot, key->objectid);
1672 :
1673 101067 : tmp = read_tree_block(fs_info, blocknr, &check);
1674 101070 : if (IS_ERR(tmp)) {
1675 0 : btrfs_release_path(p);
1676 0 : return PTR_ERR(tmp);
1677 : }
1678 : /*
1679 : * If the read above didn't mark this buffer up to date,
1680 : * it will never end up being up to date. Set ret to EIO now
1681 : * and give up so that our caller doesn't loop forever
1682 : * on our EAGAINs.
1683 : */
1684 202140 : if (!extent_buffer_uptodate(tmp))
1685 : ret = -EIO;
1686 :
1687 101070 : out:
1688 101302 : if (ret == 0) {
1689 97328 : *eb_ret = tmp;
1690 : } else {
1691 3974 : free_extent_buffer(tmp);
1692 3973 : btrfs_release_path(p);
1693 : }
1694 :
1695 : return ret;
1696 : }
1697 :
1698 : /*
1699 : * helper function for btrfs_search_slot. This does all of the checks
1700 : * for node-level blocks and does any balancing required based on
1701 : * the ins_len.
1702 : *
1703 : * If no extra work was required, zero is returned. If we had to
1704 : * drop the path, -EAGAIN is returned and btrfs_search_slot must
1705 : * start over
1706 : */
1707 : static int
1708 570187625 : setup_nodes_for_search(struct btrfs_trans_handle *trans,
1709 : struct btrfs_root *root, struct btrfs_path *p,
1710 : struct extent_buffer *b, int level, int ins_len,
1711 : int *write_lock_level)
1712 : {
1713 570187625 : struct btrfs_fs_info *fs_info = root->fs_info;
1714 570187625 : int ret = 0;
1715 :
1716 570187625 : if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1717 155042777 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1718 :
1719 186801 : if (*write_lock_level < level + 1) {
1720 70785 : *write_lock_level = level + 1;
1721 70785 : btrfs_release_path(p);
1722 70785 : return -EAGAIN;
1723 : }
1724 :
1725 116016 : reada_for_balance(p, level);
1726 116016 : ret = split_node(trans, root, p, level);
1727 :
1728 116016 : b = p->nodes[level];
1729 570000824 : } else if (ins_len < 0 && btrfs_header_nritems(b) <
1730 103814438 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1731 :
1732 79511438 : if (*write_lock_level < level + 1) {
1733 28063012 : *write_lock_level = level + 1;
1734 28063012 : btrfs_release_path(p);
1735 28063012 : return -EAGAIN;
1736 : }
1737 :
1738 51448426 : reada_for_balance(p, level);
1739 51448156 : ret = balance_level(trans, root, p, level);
1740 51447872 : if (ret)
1741 : return ret;
1742 :
1743 51447872 : b = p->nodes[level];
1744 51447872 : if (!b) {
1745 1061 : btrfs_release_path(p);
1746 1061 : return -EAGAIN;
1747 : }
1748 51446811 : BUG_ON(btrfs_header_nritems(b) == 1);
1749 : }
1750 : return ret;
1751 : }
1752 :
1753 32445 : int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1754 : u64 iobjectid, u64 ioff, u8 key_type,
1755 : struct btrfs_key *found_key)
1756 : {
1757 32445 : int ret;
1758 32445 : struct btrfs_key key;
1759 32445 : struct extent_buffer *eb;
1760 :
1761 32445 : ASSERT(path);
1762 32445 : ASSERT(found_key);
1763 :
1764 32445 : key.type = key_type;
1765 32445 : key.objectid = iobjectid;
1766 32445 : key.offset = ioff;
1767 :
1768 32445 : ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1769 32445 : if (ret < 0)
1770 : return ret;
1771 :
1772 32445 : eb = path->nodes[0];
1773 32445 : if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1774 495 : ret = btrfs_next_leaf(fs_root, path);
1775 495 : if (ret)
1776 : return ret;
1777 495 : eb = path->nodes[0];
1778 : }
1779 :
1780 32445 : btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1781 32445 : if (found_key->type != key.type ||
1782 31507 : found_key->objectid != key.objectid)
1783 938 : return 1;
1784 :
1785 : return 0;
1786 : }
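
/*
 * A minimal sketch (not in the original file) of how a caller might use
 * btrfs_find_item() above.  The helper name and the (ino, parent) pair are
 * hypothetical; note that only the objectid and type of the result are
 * checked, so the offset of @found may differ from @parent.
 */
static int example_has_inode_ref(struct btrfs_root *root, u64 ino, u64 parent)
{
	struct btrfs_path *path;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* 0: an item with matching (objectid, type) exists, 1: none, <0: error */
	ret = btrfs_find_item(root, path, ino, parent, BTRFS_INODE_REF_KEY,
			      &found);
	btrfs_free_path(path);
	return ret;
}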
1787 :
1788 394116648 : static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1789 : struct btrfs_path *p,
1790 : int write_lock_level)
1791 : {
1792 394116648 : struct extent_buffer *b;
1793 394116648 : int root_lock = 0;
1794 394116648 : int level = 0;
1795 :
1796 394116648 : if (p->search_commit_root) {
1797 33671501 : b = root->commit_root;
1798 33671501 : atomic_inc(&b->refs);
1799 33673400 : level = btrfs_header_level(b);
1800 : /*
1801 : * Ensure that all callers have set skip_locking when
1802 : * p->search_commit_root = 1.
1803 : */
1804 33673400 : ASSERT(p->skip_locking == 1);
1805 :
1806 33673400 : goto out;
1807 : }
1808 :
1809 360445147 : if (p->skip_locking) {
1810 7540898 : b = btrfs_root_node(root);
1811 7540898 : level = btrfs_header_level(b);
1812 7540898 : goto out;
1813 : }
1814 :
1815 : /* We try very hard to do read locks on the root */
1816 352904249 : root_lock = BTRFS_READ_LOCK;
1817 :
1818 : /*
1819 : * If the level is set to maximum, we can skip trying to get the read
1820 : * lock.
1821 : */
1822 352904249 : if (write_lock_level < BTRFS_MAX_LEVEL) {
1823 : /*
1824 : * We don't know the level of the root node until we actually
1825 : * have it read locked
1826 : */
1827 321699431 : if (p->nowait) {
1828 0 : b = btrfs_try_read_lock_root_node(root);
1829 0 : if (IS_ERR(b))
1830 : return b;
1831 : } else {
1832 321699431 : b = btrfs_read_lock_root_node(root);
1833 : }
1834 321763312 : level = btrfs_header_level(b);
1835 321763312 : if (level > write_lock_level)
1836 196873944 : goto out;
1837 :
1838 : /* Whoops, must trade for write lock */
1839 124889368 : btrfs_tree_read_unlock(b);
1840 124889682 : free_extent_buffer(b);
1841 : }
1842 :
1843 156110208 : b = btrfs_lock_root_node(root);
1844 156103333 : root_lock = BTRFS_WRITE_LOCK;
1845 :
1846 : /* The level might have changed, check again */
1847 156103333 : level = btrfs_header_level(b);
1848 :
1849 394191575 : out:
1850 : /*
1851 : * The root may have failed to write out at some point, and thus is no
1852 : * longer valid, return an error in this case.
1853 : */
1854 788383150 : if (!extent_buffer_uptodate(b)) {
1855 16 : if (root_lock)
1856 16 : btrfs_tree_unlock_rw(b, root_lock);
1857 16 : free_extent_buffer(b);
1858 16 : return ERR_PTR(-EIO);
1859 : }
1860 :
1861 394191559 : p->nodes[level] = b;
1862 394191559 : if (!p->skip_locking)
1863 352902605 : p->locks[level] = root_lock;
1864 : /*
1865 : * Callers are responsible for dropping b's references.
1866 : */
1867 : return b;
1868 : }
1869 :
1870 : /*
1871 : * Replace the extent buffer at the lowest level of the path with a cloned
1872 : * version. The purpose is to be able to use it safely, after releasing the
1873 : * commit root semaphore, even if relocation is happening in parallel, the
1874 : * transaction used for relocation is committed and the extent buffer is
1875 : * reallocated in the next transaction.
1876 : *
1877 : * This is used in a context where the caller does not prevent transaction
1878 : * commits from happening, either by holding a transaction handle or holding
1879 : * some lock, while it's doing searches through a commit root.
1880 : * At the moment it's only used for send operations.
1881 : */
1882 18833708 : static int finish_need_commit_sem_search(struct btrfs_path *path)
1883 : {
1884 18833708 : const int i = path->lowest_level;
1885 18833708 : const int slot = path->slots[i];
1886 18833708 : struct extent_buffer *lowest = path->nodes[i];
1887 18833708 : struct extent_buffer *clone;
1888 :
1889 18833708 : ASSERT(path->need_commit_sem);
1890 :
1891 18833708 : if (!lowest)
1892 : return 0;
1893 :
1894 18833708 : lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1895 :
1896 18833708 : clone = btrfs_clone_extent_buffer(lowest);
1897 18834098 : if (!clone)
1898 : return -ENOMEM;
1899 :
1900 18834098 : btrfs_release_path(path);
1901 18834020 : path->nodes[i] = clone;
1902 18834020 : path->slots[i] = slot;
1903 :
1904 18834020 : return 0;
1905 : }
1906 :
1907 : static inline int search_for_key_slot(struct extent_buffer *eb,
1908 : int search_low_slot,
1909 : const struct btrfs_key *key,
1910 : int prev_cmp,
1911 : int *slot)
1912 : {
1913 : /*
1914 : * If a previous call to btrfs_bin_search() on a parent node returned an
1915 : * exact match (prev_cmp == 0), we can safely assume the target key will
1916 : * always be at slot 0 on lower levels, since each key pointer
1917 : * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1918 : * subtree it points to. Thus we can skip searching lower levels.
1919 : */
1920 914530599 : if (prev_cmp == 0) {
1921 21491045 : *slot = 0;
1922 21491045 : return 0;
1923 : }
1924 :
1925 893039554 : return btrfs_bin_search(eb, search_low_slot, key, slot);
1926 : }
1927 :
1928 351700363 : static int search_leaf(struct btrfs_trans_handle *trans,
1929 : struct btrfs_root *root,
1930 : const struct btrfs_key *key,
1931 : struct btrfs_path *path,
1932 : int ins_len,
1933 : int prev_cmp)
1934 : {
1935 351700363 : struct extent_buffer *leaf = path->nodes[0];
1936 351700363 : int leaf_free_space = -1;
1937 351700363 : int search_low_slot = 0;
1938 351700363 : int ret;
1939 351700363 : bool do_bin_search = true;
1940 :
1941 : /*
1942 : * If we are doing an insertion, the leaf has enough free space and the
1943 : * destination slot for the key is not slot 0, then we can unlock our
1944 : * write lock on the parent, and any other upper nodes, before doing the
1945 : * binary search on the leaf (with search_for_key_slot()), allowing other
1946 : * tasks to lock the parent and any other upper nodes.
1947 : */
1948 351700363 : if (ins_len > 0) {
1949 : /*
1950 : * Cache the leaf free space, since we will need it later and it
1951 : * will not change until then.
1952 : */
1953 90815723 : leaf_free_space = btrfs_leaf_free_space(leaf);
1954 :
1955 : /*
1956 : * !path->locks[1] means we have a single node tree, the leaf is
1957 : * the root of the tree.
1958 : */
1959 90798835 : if (path->locks[1] && leaf_free_space >= ins_len) {
1960 79542980 : struct btrfs_disk_key first_key;
1961 :
1962 79542980 : ASSERT(btrfs_header_nritems(leaf) > 0);
1963 79542980 : btrfs_item_key(leaf, &first_key, 0);
1964 :
1965 : /*
1966 : * Doing the extra comparison with the first key is cheap,
1967 : * taking into account that the first key is very likely
1968 : * already in a cache line because it immediately follows
1969 : * the extent buffer's header and we have recently accessed
1970 : * the header's level field.
1971 : */
1972 79551521 : ret = comp_keys(&first_key, key);
1973 79551521 : if (ret < 0) {
1974 : /*
1975 : * The first key is smaller than the key we want
1976 : * to insert, so we are safe to unlock all upper
1977 : * nodes and we have to do the binary search.
1978 : *
1979 : * We do use btrfs_unlock_up_safe() and not
1980 : * unlock_up() because the latter does not unlock
1981 : * nodes with a slot of 0 - we can safely unlock
1982 : * any node even if its slot is 0 since in this
1983 : * case the key does not end up at slot 0 of the
1984 : * leaf and there's no need to split the leaf.
1985 : */
1986 72280795 : btrfs_unlock_up_safe(path, 1);
1987 72280795 : search_low_slot = 1;
1988 : } else {
1989 : /*
1990 : * The first key is >= then the key we want to
1991 : * insert, so we can skip the binary search as
1992 : * the target key will be at slot 0.
1993 : *
1994 : * We can not unlock upper nodes when the key is
1995 : * less than the first key, because we will need
1996 : * to update the key at slot 0 of the parent node
1997 : * and possibly of other upper nodes too.
1998 : * If the key matches the first key, then we can
1999 : * unlock all the upper nodes, using
2000 : * btrfs_unlock_up_safe() instead of unlock_up()
2001 : * as stated above.
2002 : */
2003 7270726 : if (ret == 0)
2004 7167264 : btrfs_unlock_up_safe(path, 1);
2005 : /*
2006 : * ret is already 0 or 1, matching the result of
2007 : * a btrfs_bin_search() call, so there is no need
2008 : * to adjust it.
2009 : */
2010 7270726 : do_bin_search = false;
2011 7270726 : path->slots[0] = 0;
2012 : }
2013 : }
2014 : }
2015 :
2016 351698558 : if (do_bin_search) {
2017 344286653 : ret = search_for_key_slot(leaf, search_low_slot, key,
2018 : prev_cmp, &path->slots[0]);
2019 344195757 : if (ret < 0)
2020 : return ret;
2021 : }
2022 :
2023 351607662 : if (ins_len > 0) {
2024 : /*
2025 : * Item key already exists. In this case, if we are allowed to
2026 : * insert the item (for example, in dir_item case, item key
2027 : * collision is allowed), it will be merged with the original
2028 : * item. Only the item size grows, no new btrfs item will be
2029 : * added. If search_for_extension is not set, ins_len already
2030 : * accounts the size btrfs_item, deduct it here so leaf space
2031 : * check will be correct.
2032 : */
2033 90799517 : if (ret == 0 && !path->search_for_extension) {
2034 453317 : ASSERT(ins_len >= sizeof(struct btrfs_item));
2035 453317 : ins_len -= sizeof(struct btrfs_item);
2036 : }
2037 :
2038 90799517 : ASSERT(leaf_free_space >= 0);
2039 :
2040 90799517 : if (leaf_free_space < ins_len) {
2041 3817097 : int err;
2042 :
2043 3817097 : err = split_leaf(trans, root, key, path, ins_len,
2044 : (ret == 0));
2045 3817144 : ASSERT(err <= 0);
2046 3817144 : if (WARN_ON(err > 0))
2047 : err = -EUCLEAN;
2048 3817144 : if (err)
2049 : ret = err;
2050 : }
2051 : }
2052 :
2053 : return ret;
2054 : }
2055 :
2056 : /*
2057 : * btrfs_search_slot - look for a key in a tree and perform necessary
2058 : * modifications to preserve tree invariants.
2059 : *
2060 : * @trans: Handle of transaction, used when modifying the tree
2061 : * @p: Holds all btree nodes along the search path
2062 : * @root: The root node of the tree
2063 : * @key: The key we are looking for
2064 : * @ins_len: Indicates purpose of search:
2065 : * >0 for inserts, it's the size of the item inserted (*)
2066 : * <0 for deletions
2067 : * 0 for plain searches, not modifying the tree
2068 : *
2069 : * (*) If size of item inserted doesn't include
2070 : * sizeof(struct btrfs_item), then p->search_for_extension must
2071 : * be set.
2072 : * @cow: boolean, whether CoW operations should be performed. Must always be 1
2073 : * when modifying the tree.
2074 : *
2075 : * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2076 : * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2077 : *
2078 : * If @key is found, 0 is returned and you can find the item in the leaf level
2079 : * of the path (level 0)
2080 : *
2081 : * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2082 : * points to the slot where it should be inserted
2083 : *
2084 : * If an error is encountered while searching the tree a negative error number
2085 : * is returned
2086 : */
2087 359780297 : int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2088 : const struct btrfs_key *key, struct btrfs_path *p,
2089 : int ins_len, int cow)
2090 : {
2091 359780297 : struct btrfs_fs_info *fs_info = root->fs_info;
2092 359780297 : struct extent_buffer *b;
2093 359780297 : int slot;
2094 359780297 : int ret;
2095 359780297 : int err;
2096 359780297 : int level;
2097 359780297 : int lowest_unlock = 1;
2098 : /* everything at write_lock_level or lower must be write locked */
2099 359780297 : int write_lock_level = 0;
2100 359780297 : u8 lowest_level = 0;
2101 359780297 : int min_write_lock_level;
2102 359780297 : int prev_cmp;
2103 :
2104 359780297 : might_sleep();
2105 :
2106 359783671 : lowest_level = p->lowest_level;
2107 359783671 : WARN_ON(lowest_level && ins_len > 0);
2108 359783671 : WARN_ON(p->nodes[0] != NULL);
2109 359783671 : BUG_ON(!cow && ins_len);
2110 :
2111 : /*
2112 : * For now only allow nowait for read only operations. There's no
2113 : * strict reason why we can't, we just only need it for reads so it's
2114 : * only implemented for reads.
2115 : */
2116 359783671 : ASSERT(!p->nowait || !cow);
2117 :
2118 359783671 : if (ins_len < 0) {
2119 58007466 : lowest_unlock = 2;
2120 :
2121 : /* when we are removing items, we might have to go up to level
2122 : * two as we update tree pointers. Make sure we keep write
2123 : * locks for those levels as well.
2124 : */
2125 58007466 : write_lock_level = 2;
2126 301776205 : } else if (ins_len > 0) {
2127 : /*
2128 : * for inserting items, make sure we have a write lock on
2129 : * level 1 so we can update keys
2130 : */
2131 90808700 : write_lock_level = 1;
2132 : }
2133 :
2134 359783671 : if (!cow)
2135 177665403 : write_lock_level = -1;
2136 :
2137 359783671 : if (cow && (p->keep_locks || p->lowest_level))
2138 31204732 : write_lock_level = BTRFS_MAX_LEVEL;
2139 :
2140 359783671 : min_write_lock_level = write_lock_level;
2141 :
2142 359783671 : if (p->need_commit_sem) {
2143 18765276 : ASSERT(p->search_commit_root);
2144 18765276 : if (p->nowait) {
2145 0 : if (!down_read_trylock(&fs_info->commit_root_sem))
2146 : return -EAGAIN;
2147 : } else {
2148 18765276 : down_read(&fs_info->commit_root_sem);
2149 : }
2150 : }
2151 :
2152 341018395 : again:
2153 394102127 : prev_cmp = -1;
2154 394102127 : b = btrfs_search_slot_get_root(root, p, write_lock_level);
2155 394060886 : if (IS_ERR(b)) {
2156 16 : ret = PTR_ERR(b);
2157 16 : goto done;
2158 : }
2159 :
2160 923044040 : while (b) {
2161 923044040 : int dec = 0;
2162 :
2163 923044040 : level = btrfs_header_level(b);
2164 :
2165 923044040 : if (cow) {
2166 471612142 : bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2167 :
2168 : /*
2169 : * if we don't really need to cow this block
2170 : * then we don't want to set the path blocking,
2171 : * so we test it here
2172 : */
2173 471612142 : if (!should_cow_block(trans, root, b))
2174 464198873 : goto cow_done;
2175 :
2176 : /*
2177 : * must have write locks on this node and the
2178 : * parent
2179 : */
2180 7381840 : if (level > write_lock_level ||
2181 7132039 : (level + 1 > write_lock_level &&
2182 1555272 : level + 1 < BTRFS_MAX_LEVEL &&
2183 1555272 : p->nodes[level + 1])) {
2184 1143236 : write_lock_level = level + 1;
2185 1143236 : btrfs_release_path(p);
2186 1143325 : goto again;
2187 : }
2188 :
2189 6238604 : if (last_level)
2190 0 : err = btrfs_cow_block(trans, root, b, NULL, 0,
2191 : &b,
2192 : BTRFS_NESTING_COW);
2193 : else
2194 6238604 : err = btrfs_cow_block(trans, root, b,
2195 : p->nodes[level + 1],
2196 : p->slots[level + 1], &b,
2197 : BTRFS_NESTING_COW);
2198 6238720 : if (err) {
2199 65 : ret = err;
2200 65 : goto done;
2201 : }
2202 : }
2203 457670553 : cow_done:
2204 921869426 : p->nodes[level] = b;
2205 :
2206 : /*
2207 : * we have a lock on b and as long as we aren't changing
2208 : * the tree, there is no way for the items in b to change.
2209 : * It is safe to drop the lock on our parent before we
2210 : * go through the expensive btree search on b.
2211 : *
2212 : * If we're inserting or deleting (ins_len != 0), then we might
2213 : * be changing slot zero, which may require changing the parent.
2214 : * So, we can't drop the lock until after we know which slot
2215 : * we're operating on.
2216 : */
2217 921869426 : if (!ins_len && !p->keep_locks) {
2218 431279158 : int u = level + 1;
2219 :
2220 431279158 : if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2221 189295438 : btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2222 189254993 : p->locks[u] = 0;
2223 : }
2224 : }
2225 :
2226 921828981 : if (level == 0) {
2227 351585035 : if (ins_len > 0)
2228 : ASSERT(write_lock_level >= 1);
2229 :
2230 351585035 : ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2231 351422803 : if (!p->search_for_split)
2232 351394329 : unlock_up(p, level, lowest_unlock,
2233 : min_write_lock_level, NULL);
2234 351427913 : goto done;
2235 : }
2236 :
2237 570243946 : ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2238 570061598 : if (ret < 0)
2239 0 : goto done;
2240 570061598 : prev_cmp = ret;
2241 :
2242 570061598 : if (ret && slot > 0) {
2243 532919371 : dec = 1;
2244 532919371 : slot--;
2245 : }
2246 570061598 : p->slots[level] = slot;
2247 570061598 : err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2248 : &write_lock_level);
2249 570297286 : if (err == -EAGAIN)
2250 28134889 : goto again;
2251 542162397 : if (err) {
2252 0 : ret = err;
2253 0 : goto done;
2254 : }
2255 542162397 : b = p->nodes[level];
2256 542162397 : slot = p->slots[level];
2257 :
2258 : /*
2259 : * Slot 0 is special, if we change the key we have to update
2260 : * the parent pointer which means we must have a write lock on
2261 : * the parent
2262 : */
2263 542162397 : if (slot == 0 && ins_len && write_lock_level < level + 1) {
2264 5038087 : write_lock_level = level + 1;
2265 5038087 : btrfs_release_path(p);
2266 5038204 : goto again;
2267 : }
2268 :
2269 537124310 : unlock_up(p, level, lowest_unlock, min_write_lock_level,
2270 : &write_lock_level);
2271 :
2272 536923991 : if (level == lowest_level) {
2273 8236222 : if (dec)
2274 26099 : p->slots[level]++;
2275 8236222 : goto done;
2276 : }
2277 :
2278 528687769 : err = read_block_for_search(root, p, &b, level, slot, key);
2279 528940254 : if (err == -EAGAIN)
2280 1159 : goto again;
2281 528939095 : if (err) {
2282 0 : ret = err;
2283 0 : goto done;
2284 : }
2285 :
2286 528939095 : if (!p->skip_locking) {
2287 469584573 : level = btrfs_header_level(b);
2288 :
2289 469584573 : btrfs_maybe_reset_lockdep_class(root, b);
2290 :
2291 469584573 : if (level <= write_lock_level) {
2292 249360214 : btrfs_tree_lock(b);
2293 249389094 : p->locks[level] = BTRFS_WRITE_LOCK;
2294 : } else {
2295 220224359 : if (p->nowait) {
2296 0 : if (!btrfs_try_tree_read_lock(b)) {
2297 0 : free_extent_buffer(b);
2298 0 : ret = -EAGAIN;
2299 0 : goto done;
2300 : }
2301 : } else {
2302 220224359 : btrfs_tree_read_lock(b);
2303 : }
2304 220239554 : p->locks[level] = BTRFS_READ_LOCK;
2305 : }
2306 469628648 : p->nodes[level] = b;
2307 : }
2308 : }
2309 : ret = 1;
2310 359664216 : done:
2311 359664216 : if (ret < 0 && !p->skip_release_on_error)
2312 81 : btrfs_release_path(p);
2313 :
2314 359664216 : if (p->need_commit_sem) {
2315 18765895 : int ret2;
2316 :
2317 18765895 : ret2 = finish_need_commit_sem_search(p);
2318 18766457 : up_read(&fs_info->commit_root_sem);
2319 18766870 : if (ret2)
2320 0 : ret = ret2;
2321 : }
2322 :
2323 : return ret;
2324 : }
2325 : ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
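
/*
 * A minimal sketch (not in the original file) of a plain read-only lookup
 * built on btrfs_search_slot() as documented above.  The helper name and
 * the choice of an inode item are hypothetical and error handling is
 * trimmed to the essentials.
 */
static int example_read_inode_size(struct btrfs_root *root, u64 ino,
				   u64 *size_ret)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* trans == NULL, ins_len == 0 and cow == 0: pure search, nothing is modified */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;	/* not found; path points at the insert position */
	if (ret == 0) {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
	}
	btrfs_free_path(path);
	return ret;
}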
2326 :
2327 : /*
2328 : * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2329 : * current state of the tree together with the operations recorded in the tree
2330 : * modification log to search for the key in a previous version of this tree, as
2331 : * denoted by the time_seq parameter.
2332 : *
2333 : * Naturally, there is no support for insert, delete or cow operations.
2334 : *
2335 : * The resulting path and return value will be set up as if we called
2336 : * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2337 : */
2338 3495570 : int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2339 : struct btrfs_path *p, u64 time_seq)
2340 : {
2341 3495570 : struct btrfs_fs_info *fs_info = root->fs_info;
2342 3495570 : struct extent_buffer *b;
2343 3495570 : int slot;
2344 3495570 : int ret;
2345 3495570 : int err;
2346 3495570 : int level;
2347 3495570 : int lowest_unlock = 1;
2348 3495570 : u8 lowest_level = 0;
2349 :
2350 3495570 : lowest_level = p->lowest_level;
2351 3495570 : WARN_ON(p->nodes[0] != NULL);
2352 3495570 : ASSERT(!p->nowait);
2353 :
2354 3495570 : if (p->search_commit_root) {
2355 3298435 : BUG_ON(time_seq);
2356 3298435 : return btrfs_search_slot(NULL, root, key, p, 0, 0);
2357 : }
2358 :
2359 197135 : again:
2360 197135 : b = btrfs_get_old_root(root, time_seq);
2361 197135 : if (!b) {
2362 0 : ret = -EIO;
2363 0 : goto done;
2364 : }
2365 197135 : level = btrfs_header_level(b);
2366 197135 : p->locks[level] = BTRFS_READ_LOCK;
2367 :
2368 403142 : while (b) {
2369 403142 : int dec = 0;
2370 :
2371 403142 : level = btrfs_header_level(b);
2372 403142 : p->nodes[level] = b;
2373 :
2374 : /*
2375 : * we have a lock on b and as long as we aren't changing
2376 : * the tree, there is no way for the items in b to change.
2377 : * It is safe to drop the lock on our parent before we
2378 : * go through the expensive btree search on b.
2379 : */
2380 403142 : btrfs_unlock_up_safe(p, level + 1);
2381 :
2382 403142 : ret = btrfs_bin_search(b, 0, key, &slot);
2383 403142 : if (ret < 0)
2384 0 : goto done;
2385 :
2386 403142 : if (level == 0) {
2387 76674 : p->slots[level] = slot;
2388 76674 : unlock_up(p, level, lowest_unlock, 0, NULL);
2389 76674 : goto done;
2390 : }
2391 :
2392 326468 : if (ret && slot > 0) {
2393 203401 : dec = 1;
2394 203401 : slot--;
2395 : }
2396 326468 : p->slots[level] = slot;
2397 326468 : unlock_up(p, level, lowest_unlock, 0, NULL);
2398 :
2399 326468 : if (level == lowest_level) {
2400 120461 : if (dec)
2401 0 : p->slots[level]++;
2402 120461 : goto done;
2403 : }
2404 :
2405 206007 : err = read_block_for_search(root, p, &b, level, slot, key);
2406 206007 : if (err == -EAGAIN)
2407 0 : goto again;
2408 206007 : if (err) {
2409 0 : ret = err;
2410 0 : goto done;
2411 : }
2412 :
2413 206007 : level = btrfs_header_level(b);
2414 206007 : btrfs_tree_read_lock(b);
2415 206007 : b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2416 206007 : if (!b) {
2417 0 : ret = -ENOMEM;
2418 0 : goto done;
2419 : }
2420 206007 : p->locks[level] = BTRFS_READ_LOCK;
2421 206007 : p->nodes[level] = b;
2422 : }
2423 : ret = 1;
2424 197135 : done:
2425 197135 : if (ret < 0)
2426 0 : btrfs_release_path(p);
2427 :
2428 : return ret;
2429 : }
2430 :
2431 : /*
2432 : * Search the tree again to find a leaf with smaller keys.
2433 : * Returns 0 if it found something.
2434 : * Returns 1 if there are no smaller keys.
2435 : * Returns < 0 on error.
2436 : *
2437 : * This may release the path, and so you may lose any locks held at the
2438 : * time you call it.
2439 : */
2440 963 : static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2441 : {
2442 963 : struct btrfs_key key;
2443 963 : struct btrfs_key orig_key;
2444 963 : struct btrfs_disk_key found_key;
2445 963 : int ret;
2446 :
2447 963 : btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2448 963 : orig_key = key;
2449 :
2450 963 : if (key.offset > 0) {
2451 963 : key.offset--;
2452 0 : } else if (key.type > 0) {
2453 0 : key.type--;
2454 0 : key.offset = (u64)-1;
2455 0 : } else if (key.objectid > 0) {
2456 0 : key.objectid--;
2457 0 : key.type = (u8)-1;
2458 0 : key.offset = (u64)-1;
2459 : } else {
2460 : return 1;
2461 : }
2462 :
2463 963 : btrfs_release_path(path);
2464 963 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2465 963 : if (ret <= 0)
2466 : return ret;
2467 :
2468 : /*
2469 : * Previous key not found. Even if we were at slot 0 of the leaf we had
2470 : * before releasing the path and calling btrfs_search_slot(), we now may
2471 : * be in a slot pointing to the same original key - this can happen if
2472 : * after we released the path, one of more items were moved from a
2473 : * sibling leaf into the front of the leaf we had due to an insertion
2474 : * (see push_leaf_right()).
2475 : * If we hit this case and our slot is > 0 and just decrement the slot
2476 : * so that the caller does not process the same key again, which may or
2477 : * may not break the caller, depending on its logic.
2478 : */
2479 963 : if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2480 918 : btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2481 918 : ret = comp_keys(&found_key, &orig_key);
2482 918 : if (ret == 0) {
2483 918 : if (path->slots[0] > 0) {
2484 0 : path->slots[0]--;
2485 0 : return 0;
2486 : }
2487 : /*
2488 : * At slot 0, same key as before, it means orig_key is
2489 : * the lowest, leftmost, key in the tree. We're done.
2490 : */
2491 : return 1;
2492 : }
2493 : }
2494 :
2495 45 : btrfs_item_key(path->nodes[0], &found_key, 0);
2496 45 : ret = comp_keys(&found_key, &key);
2497 : /*
2498 : * We might have had an item with the previous key in the tree right
2499 : * before we released our path. And after we released our path, that
2500 : * item might have been pushed to the first slot (0) of the leaf we
2501 : * were holding due to a tree balance. Alternatively, an item with the
2502 : * previous key can exist as the only element of a leaf (big fat item).
2503 : * Therefore account for these 2 cases, so that our callers (like
2504 : * btrfs_previous_item) don't miss an existing item with a key matching
2505 : * the previous key we computed above.
2506 : */
2507 45 : if (ret <= 0)
2508 45 : return 0;
2509 : return 1;
2510 : }
2511 :
2512 : /*
2513 : * helper to use instead of search slot if no exact match is needed but
2514 : * instead the next or previous item should be returned.
2515 : * When find_higher is true, the next higher item is returned, the next lower
2516 : * otherwise.
2517 : * When return_any and find_higher are both true, and no higher item is found,
2518 : * return the next lower instead.
2519 : * When return_any is true and find_higher is false, and no lower item is found,
2520 : * return the next higher instead.
2521 : * It returns 0 if any item is found, 1 if none is found (tree empty), and
2522 : * < 0 on error
2523 : */
2524 1842681 : int btrfs_search_slot_for_read(struct btrfs_root *root,
2525 : const struct btrfs_key *key,
2526 : struct btrfs_path *p, int find_higher,
2527 : int return_any)
2528 : {
2529 1842681 : int ret;
2530 1842681 : struct extent_buffer *leaf;
2531 :
2532 : again:
2533 1842681 : ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2534 1842701 : if (ret <= 0)
2535 1019 : return ret;
2536 : /*
2537 : * a return value of 1 means the path is at the position where the
2538 : * item should be inserted. Normally this is the next bigger item,
2539 : * but in case the previous item is the last in a leaf, path points
2540 : * to the first free slot in the previous leaf, i.e. at an invalid
2541 : * item.
2542 : */
2543 1841682 : leaf = p->nodes[0];
2544 :
2545 1841682 : if (find_higher) {
2546 1433344 : if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2547 60130 : ret = btrfs_next_leaf(root, p);
2548 60130 : if (ret <= 0)
2549 60053 : return ret;
2550 77 : if (!return_any)
2551 : return 1;
2552 : /*
2553 : * no higher item found, return the next
2554 : * lower instead
2555 : */
2556 0 : return_any = 0;
2557 0 : find_higher = 0;
2558 0 : btrfs_release_path(p);
2559 0 : goto again;
2560 : }
2561 : } else {
2562 408338 : if (p->slots[0] == 0) {
2563 0 : ret = btrfs_prev_leaf(root, p);
2564 0 : if (ret < 0)
2565 0 : return ret;
2566 0 : if (!ret) {
2567 0 : leaf = p->nodes[0];
2568 0 : if (p->slots[0] == btrfs_header_nritems(leaf))
2569 0 : p->slots[0]--;
2570 0 : return 0;
2571 : }
2572 0 : if (!return_any)
2573 : return 1;
2574 : /*
2575 : * no lower item found, return the next
2576 : * higher instead
2577 : */
2578 0 : return_any = 0;
2579 0 : find_higher = 1;
2580 0 : btrfs_release_path(p);
2581 0 : goto again;
2582 : } else {
2583 408338 : --p->slots[0];
2584 : }
2585 : }
2586 : return 0;
2587 : }
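
/*
 * A short usage note (not in the original file): a typical call positions
 * the path at the nearest existing item when an exact key may be absent,
 * for example (hypothetical fragment):
 *
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *
 * i.e. find_higher and return_any both set, so the first item >= key is
 * returned, falling back to the last lower item if nothing higher exists.
 */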
2588 :
2589 : /*
2590 : * Execute search and call btrfs_previous_item to traverse backwards if the item
2591 : * was not found.
2592 : *
2593 : * Return 0 if found, 1 if not found and < 0 if error.
2594 : */
2595 4073 : int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2596 : struct btrfs_path *path)
2597 : {
2598 4073 : int ret;
2599 :
2600 4073 : ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2601 4073 : if (ret > 0)
2602 3853 : ret = btrfs_previous_item(root, path, key->objectid, key->type);
2603 :
2604 4072 : if (ret == 0)
2605 2816 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2606 :
2607 4073 : return ret;
2608 : }
2609 :
2610 : /*
2611 : * Search for a valid slot for the given path.
2612 : *
2613 : * @root: The root node of the tree.
2614 : * @key: Will contain a valid item if found.
2615 : * @path: The starting point to validate the slot.
2616 : *
2617 : * Return: 0 if the item is valid
2618 : * 1 if not found
2619 : * <0 if error.
2620 : */
2621 2807717109 : int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2622 : struct btrfs_path *path)
2623 : {
2624 2807717109 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2625 11849130 : int ret;
2626 :
2627 11849130 : ret = btrfs_next_leaf(root, path);
2628 11848832 : if (ret)
2629 : return ret;
2630 : }
2631 :
2632 2807701401 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2633 2807701401 : return 0;
2634 : }
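
/*
 * A sketch (not in the original file) of the iteration pattern this helper
 * enables: walk every item of one objectid.  The btrfs_for_each_slot()
 * macro in ctree.h wraps essentially the same loop; the function below and
 * its name are hypothetical.
 */
static int example_walk_objectid(struct btrfs_root *root,
				 const struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	while (ret >= 0) {
		/* 0: key/path are valid, 1: no more items, <0: error */
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret)
			break;
		if (key.objectid != first_key->objectid)
			break;
		/* ... process path->nodes[0] at path->slots[0] ... */
		path->slots[0]++;
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}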
2635 :
2636 : /*
2637 : * adjust the pointers going up the tree, starting at level
2638 : * making sure the right key of each node points to 'key'.
2639 : * This is used after shifting pointers to the left, so it stops
2640 : * fixing up pointers when a given leaf/node is not in slot 0 of the
2641 : * higher levels
2642 : *
2643 : */
2644 5049891 : static void fixup_low_keys(struct btrfs_path *path,
2645 : struct btrfs_disk_key *key, int level)
2646 : {
2647 5049891 : int i;
2648 5049891 : struct extent_buffer *t;
2649 5049891 : int ret;
2650 :
2651 5358137 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2652 5358137 : int tslot = path->slots[i];
2653 :
2654 5358137 : if (!path->nodes[i])
2655 : break;
2656 4921317 : t = path->nodes[i];
2657 4921317 : ret = btrfs_tree_mod_log_insert_key(t, tslot,
2658 : BTRFS_MOD_LOG_KEY_REPLACE);
2659 4921315 : BUG_ON(ret < 0);
2660 4921315 : btrfs_set_node_key(t, key, tslot);
2661 4921267 : btrfs_mark_buffer_dirty(path->nodes[i]);
2662 4921343 : if (tslot != 0)
2663 : break;
2664 : }
2665 5049917 : }
2666 :
2667 : /*
2668 : * update item key.
2669 : *
2670 : * This function isn't completely safe. It's the caller's responsibility
2671 : * that the new key won't break the order
2672 : */
2673 5294594 : void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2674 : struct btrfs_path *path,
2675 : const struct btrfs_key *new_key)
2676 : {
2677 5294594 : struct btrfs_disk_key disk_key;
2678 5294594 : struct extent_buffer *eb;
2679 5294594 : int slot;
2680 :
2681 5294594 : eb = path->nodes[0];
2682 5294594 : slot = path->slots[0];
2683 5294594 : if (slot > 0) {
2684 4671391 : btrfs_item_key(eb, &disk_key, slot - 1);
2685 4671380 : if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2686 0 : btrfs_print_leaf(eb);
2687 0 : btrfs_crit(fs_info,
2688 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2689 : slot, btrfs_disk_key_objectid(&disk_key),
2690 : btrfs_disk_key_type(&disk_key),
2691 : btrfs_disk_key_offset(&disk_key),
2692 : new_key->objectid, new_key->type,
2693 : new_key->offset);
2694 0 : BUG();
2695 : }
2696 : }
2697 5294583 : if (slot < btrfs_header_nritems(eb) - 1) {
2698 4849965 : btrfs_item_key(eb, &disk_key, slot + 1);
2699 4849965 : if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2700 0 : btrfs_print_leaf(eb);
2701 0 : btrfs_crit(fs_info,
2702 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2703 : slot, btrfs_disk_key_objectid(&disk_key),
2704 : btrfs_disk_key_type(&disk_key),
2705 : btrfs_disk_key_offset(&disk_key),
2706 : new_key->objectid, new_key->type,
2707 : new_key->offset);
2708 0 : BUG();
2709 : }
2710 : }
2711 :
2712 5294583 : btrfs_cpu_key_to_disk(&disk_key, new_key);
2713 5294577 : btrfs_set_item_key(eb, &disk_key, slot);
2714 5294581 : btrfs_mark_buffer_dirty(eb);
2715 5294584 : if (slot == 0)
2716 623200 : fixup_low_keys(path, &disk_key, 1);
2717 5294584 : }
2718 :
2719 : /*
2720 : * Check key order of two sibling extent buffers.
2721 : *
2722 : * Return true if something is wrong.
2723 : * Return false if everything is fine.
2724 : *
2725 : * Tree-checker only works inside one tree block, thus the following
2726 : * corruption can not be detected by tree-checker:
2727 : *
2728 : * Leaf @left | Leaf @right
2729 : * --------------------------------------------------------------
2730 : * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2731 : *
2732 : * Key f6 in leaf @left itself is valid, but not valid when the next
2733 : * key in leaf @right is 7.
2734 : * This can only be checked at tree block merge time.
2735 : * And since tree checker has ensured all key order in each tree block
2736 : * is correct, we only need to bother the last key of @left and the first
2737 : * key of @right.
2738 : */
2739 4355223 : static bool check_sibling_keys(struct extent_buffer *left,
2740 : struct extent_buffer *right)
2741 : {
2742 4355223 : struct btrfs_key left_last;
2743 4355223 : struct btrfs_key right_first;
2744 4355223 : int level = btrfs_header_level(left);
2745 4355223 : int nr_left = btrfs_header_nritems(left);
2746 4355223 : int nr_right = btrfs_header_nritems(right);
2747 :
2748 : /* No key to check in one of the tree blocks */
2749 4355223 : if (!nr_left || !nr_right)
2750 : return false;
2751 :
2752 4355170 : if (level) {
2753 116287 : btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2754 116287 : btrfs_node_key_to_cpu(right, &right_first, 0);
2755 : } else {
2756 4238883 : btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2757 4238863 : btrfs_item_key_to_cpu(right, &right_first, 0);
2758 : }
2759 :
2760 4355168 : if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2761 0 : btrfs_crit(left->fs_info, "left extent buffer:");
2762 0 : btrfs_print_tree(left, false);
2763 0 : btrfs_crit(left->fs_info, "right extent buffer:");
2764 0 : btrfs_print_tree(right, false);
2765 0 : btrfs_crit(left->fs_info,
2766 : "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2767 : left_last.objectid, left_last.type,
2768 : left_last.offset, right_first.objectid,
2769 : right_first.type, right_first.offset);
2770 0 : return true;
2771 : }
2772 : return false;
2773 : }
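
/*
 * A small sketch (not in the original file) of the ordering rule enforced
 * above: btrfs keys compare as the tuple (objectid, type, offset), so the
 * f6/7 situation in the diagram is exactly a "last key of left >= first
 * key of right" case.  The values below are hypothetical.
 *
 *	struct btrfs_key last_left = { .objectid = 7,
 *				       .type = BTRFS_EXTENT_DATA_KEY,
 *				       .offset = 0 };
 *	struct btrfs_key first_right = { .objectid = 7,
 *					 .type = BTRFS_EXTENT_DATA_KEY,
 *					 .offset = 0 };
 *
 *	if (btrfs_comp_cpu_keys(&last_left, &first_right) >= 0)
 *		// the siblings violate the required strict left < right order
 */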
2774 :
2775 : /*
2776 : * try to push data from one node into the next node left in the
2777 : * tree.
2778 : *
2779 : * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2780 : * error, and > 0 if there was no room in the left hand block.
2781 : */
2782 86717 : static int push_node_left(struct btrfs_trans_handle *trans,
2783 : struct extent_buffer *dst,
2784 : struct extent_buffer *src, int empty)
2785 : {
2786 86717 : struct btrfs_fs_info *fs_info = trans->fs_info;
2787 86717 : int push_items = 0;
2788 86717 : int src_nritems;
2789 86717 : int dst_nritems;
2790 86717 : int ret = 0;
2791 :
2792 86717 : src_nritems = btrfs_header_nritems(src);
2793 86717 : dst_nritems = btrfs_header_nritems(dst);
2794 86717 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2795 86717 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2796 86717 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2797 :
2798 86717 : if (!empty && src_nritems <= 8)
2799 : return 1;
2800 :
2801 86717 : if (push_items <= 0)
2802 : return 1;
2803 :
2804 59805 : if (empty) {
2805 1484 : push_items = min(src_nritems, push_items);
2806 1484 : if (push_items < src_nritems) {
2807 : /* leave at least 8 pointers in the node if
2808 : * we aren't going to empty it
2809 : */
2810 1322 : if (src_nritems - push_items < 8) {
2811 759 : if (push_items <= 8)
2812 : return 1;
2813 7 : push_items -= 8;
2814 : }
2815 : }
2816 : } else
2817 58321 : push_items = min(src_nritems - 8, push_items);
2818 :
2819 : /* dst is the left eb, src is the middle eb */
2820 59053 : if (check_sibling_keys(dst, src)) {
2821 0 : ret = -EUCLEAN;
2822 0 : btrfs_abort_transaction(trans, ret);
2823 0 : return ret;
2824 : }
2825 59053 : ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2826 59053 : if (ret) {
2827 0 : btrfs_abort_transaction(trans, ret);
2828 0 : return ret;
2829 : }
2830 59053 : copy_extent_buffer(dst, src,
2831 : btrfs_node_key_ptr_offset(dst, dst_nritems),
2832 : btrfs_node_key_ptr_offset(src, 0),
2833 : push_items * sizeof(struct btrfs_key_ptr));
2834 :
2835 59053 : if (push_items < src_nritems) {
2836 : /*
2837 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2838 : * don't need to do an explicit tree mod log operation for it.
2839 : */
2840 58891 : memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2841 : btrfs_node_key_ptr_offset(src, push_items),
2842 58891 : (src_nritems - push_items) *
2843 : sizeof(struct btrfs_key_ptr));
2844 : }
2845 59053 : btrfs_set_header_nritems(src, src_nritems - push_items);
2846 59053 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2847 59053 : btrfs_mark_buffer_dirty(src);
2848 59053 : btrfs_mark_buffer_dirty(dst);
2849 :
2850 59053 : return ret;
2851 : }
2852 :
2853 : /*
2854 : * try to push data from one node into the next node right in the
2855 : * tree.
2856 : *
2857 : * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2858 : * error, and > 0 if there was no room in the right hand block.
2859 : *
2860 : * this will only push up to 1/2 the contents of the left node over
2861 : */
2862 57287 : static int balance_node_right(struct btrfs_trans_handle *trans,
2863 : struct extent_buffer *dst,
2864 : struct extent_buffer *src)
2865 : {
2866 57287 : struct btrfs_fs_info *fs_info = trans->fs_info;
2867 57287 : int push_items = 0;
2868 57287 : int max_push;
2869 57287 : int src_nritems;
2870 57287 : int dst_nritems;
2871 57287 : int ret = 0;
2872 :
2873 57287 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2874 57287 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2875 :
2876 57287 : src_nritems = btrfs_header_nritems(src);
2877 57287 : dst_nritems = btrfs_header_nritems(dst);
2878 57287 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2879 57287 : if (push_items <= 0)
2880 : return 1;
2881 :
2882 57287 : if (src_nritems < 4)
2883 : return 1;
2884 :
2885 57287 : max_push = src_nritems / 2 + 1;
2886 : /* don't try to empty the node */
2887 57287 : if (max_push >= src_nritems)
2888 : return 1;
2889 :
2890 57287 : if (max_push < push_items)
2891 : push_items = max_push;
2892 :
2893 : /* dst is the right eb, src is the middle eb */
2894 57287 : if (check_sibling_keys(src, dst)) {
2895 0 : ret = -EUCLEAN;
2896 0 : btrfs_abort_transaction(trans, ret);
2897 0 : return ret;
2898 : }
2899 :
2900 : /*
2901 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2902 : * need to do an explicit tree mod log operation for it.
2903 : */
2904 57287 : memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2905 : btrfs_node_key_ptr_offset(dst, 0),
2906 : (dst_nritems) *
2907 : sizeof(struct btrfs_key_ptr));
2908 :
2909 57287 : ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2910 : push_items);
2911 57287 : if (ret) {
2912 0 : btrfs_abort_transaction(trans, ret);
2913 0 : return ret;
2914 : }
2915 57287 : copy_extent_buffer(dst, src,
2916 : btrfs_node_key_ptr_offset(dst, 0),
2917 : btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2918 : push_items * sizeof(struct btrfs_key_ptr));
2919 :
2920 57287 : btrfs_set_header_nritems(src, src_nritems - push_items);
2921 57287 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2922 :
2923 57287 : btrfs_mark_buffer_dirty(src);
2924 57287 : btrfs_mark_buffer_dirty(dst);
2925 :
2926 57287 : return ret;
2927 : }
2928 :
2929 : /*
2930 : * helper function to insert a new root level in the tree.
2931 : * A new node is allocated, and a single item is inserted to
2932 : * point to the existing root
2933 : *
2934 : * returns zero on success or < 0 on failure.
2935 : */
2936 1856 : static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2937 : struct btrfs_root *root,
2938 : struct btrfs_path *path, int level)
2939 : {
2940 1856 : struct btrfs_fs_info *fs_info = root->fs_info;
2941 1856 : u64 lower_gen;
2942 1856 : struct extent_buffer *lower;
2943 1856 : struct extent_buffer *c;
2944 1856 : struct extent_buffer *old;
2945 1856 : struct btrfs_disk_key lower_key;
2946 1856 : int ret;
2947 :
2948 1856 : BUG_ON(path->nodes[level]);
2949 1856 : BUG_ON(path->nodes[level-1] != root->node);
2950 :
2951 1856 : lower = path->nodes[level-1];
2952 1856 : if (level == 1)
2953 1785 : btrfs_item_key(lower, &lower_key, 0);
2954 : else
2955 71 : btrfs_node_key(lower, &lower_key, 0);
2956 :
2957 1856 : c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2958 1856 : &lower_key, level, root->node->start, 0,
2959 : BTRFS_NESTING_NEW_ROOT);
2960 1856 : if (IS_ERR(c))
2961 0 : return PTR_ERR(c);
2962 :
2963 1856 : root_add_used(root, fs_info->nodesize);
2964 :
2965 1856 : btrfs_set_header_nritems(c, 1);
2966 1856 : btrfs_set_node_key(c, &lower_key, 0);
2967 1856 : btrfs_set_node_blockptr(c, 0, lower->start);
2968 1856 : lower_gen = btrfs_header_generation(lower);
2969 1856 : WARN_ON(lower_gen != trans->transid);
2970 :
2971 1856 : btrfs_set_node_ptr_generation(c, 0, lower_gen);
2972 :
2973 1856 : btrfs_mark_buffer_dirty(c);
2974 :
2975 1856 : old = root->node;
2976 1856 : ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2977 1856 : if (ret < 0) {
2978 0 : btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2979 0 : btrfs_tree_unlock(c);
2980 0 : free_extent_buffer(c);
2981 0 : return ret;
2982 : }
2983 1856 : rcu_assign_pointer(root->node, c);
2984 :
2985 : /* the super has an extra ref to root->node */
2986 1856 : free_extent_buffer(old);
2987 :
2988 1856 : add_root_to_dirty_list(root);
2989 1856 : atomic_inc(&c->refs);
2990 1856 : path->nodes[level] = c;
2991 1856 : path->locks[level] = BTRFS_WRITE_LOCK;
2992 1856 : path->slots[level] = 0;
2993 1856 : return 0;
2994 : }
2995 :
2996 : /*
2997 : * worker function to insert a single pointer in a node.
2998 : * the node should have enough room for the pointer already
2999 : *
3000 : * slot and level indicate where you want the key to go, and
3001 : * blocknr is the block the key points to.
3002 : */
3003 364898 : static int insert_ptr(struct btrfs_trans_handle *trans,
3004 : struct btrfs_path *path,
3005 : struct btrfs_disk_key *key, u64 bytenr,
3006 : int slot, int level)
3007 : {
3008 364898 : struct extent_buffer *lower;
3009 364898 : int nritems;
3010 364898 : int ret;
3011 :
3012 364898 : BUG_ON(!path->nodes[level]);
3013 364898 : btrfs_assert_tree_write_locked(path->nodes[level]);
3014 364898 : lower = path->nodes[level];
3015 364898 : nritems = btrfs_header_nritems(lower);
3016 364898 : BUG_ON(slot > nritems);
3017 364898 : BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3018 364898 : if (slot != nritems) {
3019 189078 : if (level) {
3020 189078 : ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
3021 : slot, nritems - slot);
3022 189066 : if (ret < 0) {
3023 0 : btrfs_abort_transaction(trans, ret);
3024 0 : return ret;
3025 : }
3026 : }
3027 189066 : memmove_extent_buffer(lower,
3028 : btrfs_node_key_ptr_offset(lower, slot + 1),
3029 : btrfs_node_key_ptr_offset(lower, slot),
3030 189066 : (nritems - slot) * sizeof(struct btrfs_key_ptr));
3031 : }
3032 364903 : if (level) {
3033 364903 : ret = btrfs_tree_mod_log_insert_key(lower, slot,
3034 : BTRFS_MOD_LOG_KEY_ADD);
3035 364893 : if (ret < 0) {
3036 0 : btrfs_abort_transaction(trans, ret);
3037 0 : return ret;
3038 : }
3039 : }
3040 364893 : btrfs_set_node_key(lower, key, slot);
3041 364893 : btrfs_set_node_blockptr(lower, slot, bytenr);
3042 364880 : WARN_ON(trans->transid == 0);
3043 364880 : btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3044 364884 : btrfs_set_header_nritems(lower, nritems + 1);
3045 364884 : btrfs_mark_buffer_dirty(lower);
3046 :
3047 364884 : return 0;
3048 : }
3049 :
3050 : /*
3051 : * split the node at the specified level in path in two.
3052 : * The path is corrected to point to the appropriate node after the split
3053 : *
3054 : * Before splitting this tries to make some room in the node by pushing
3055 : * left and right, if either one works, it returns right away.
3056 : *
3057 : * returns 0 on success and < 0 on failure
3058 : */
3059 116016 : static noinline int split_node(struct btrfs_trans_handle *trans,
3060 : struct btrfs_root *root,
3061 : struct btrfs_path *path, int level)
3062 : {
3063 116016 : struct btrfs_fs_info *fs_info = root->fs_info;
3064 116016 : struct extent_buffer *c;
3065 116016 : struct extent_buffer *split;
3066 116016 : struct btrfs_disk_key disk_key;
3067 116016 : int mid;
3068 116016 : int ret;
3069 116016 : u32 c_nritems;
3070 :
3071 116016 : c = path->nodes[level];
3072 116016 : WARN_ON(btrfs_header_generation(c) != trans->transid);
3073 116016 : if (c == root->node) {
3074 : /*
3075 : * trying to split the root, lets make a new one
3076 : *
3077 : * tree mod log: We don't log_removal old root in
3078 : * insert_new_root, because that root buffer will be kept as a
3079 : * normal node. We are going to log removal of half of the
3080 : * elements below with btrfs_tree_mod_log_eb_copy(). We're
3081 : * holding a tree lock on the buffer, which is why we cannot
3082 : * race with other tree_mod_log users.
3083 : */
3084 71 : ret = insert_new_root(trans, root, path, level + 1);
3085 71 : if (ret)
3086 : return ret;
3087 : } else {
3088 115945 : ret = push_nodes_for_insert(trans, root, path, level);
3089 115945 : c = path->nodes[level];
3090 115945 : if (!ret && btrfs_header_nritems(c) <
3091 115608 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3092 : return 0;
3093 701 : if (ret < 0)
3094 : return ret;
3095 : }
3096 :
3097 772 : c_nritems = btrfs_header_nritems(c);
3098 772 : mid = (c_nritems + 1) / 2;
3099 772 : btrfs_node_key(c, &disk_key, mid);
3100 :
3101 772 : split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3102 : &disk_key, level, c->start, 0,
3103 : BTRFS_NESTING_SPLIT);
3104 772 : if (IS_ERR(split))
3105 0 : return PTR_ERR(split);
3106 :
3107 772 : root_add_used(root, fs_info->nodesize);
3108 772 : ASSERT(btrfs_header_level(c) == level);
3109 :
3110 772 : ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3111 772 : if (ret) {
3112 0 : btrfs_tree_unlock(split);
3113 0 : free_extent_buffer(split);
3114 0 : btrfs_abort_transaction(trans, ret);
3115 0 : return ret;
3116 : }
3117 772 : copy_extent_buffer(split, c,
3118 : btrfs_node_key_ptr_offset(split, 0),
3119 : btrfs_node_key_ptr_offset(c, mid),
3120 772 : (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3121 772 : btrfs_set_header_nritems(split, c_nritems - mid);
3122 772 : btrfs_set_header_nritems(c, mid);
3123 :
3124 772 : btrfs_mark_buffer_dirty(c);
3125 772 : btrfs_mark_buffer_dirty(split);
3126 :
3127 772 : ret = insert_ptr(trans, path, &disk_key, split->start,
3128 772 : path->slots[level + 1] + 1, level + 1);
3129 772 : if (ret < 0) {
3130 0 : btrfs_tree_unlock(split);
3131 0 : free_extent_buffer(split);
3132 0 : return ret;
3133 : }
3134 :
3135 772 : if (path->slots[level] >= mid) {
3136 662 : path->slots[level] -= mid;
3137 662 : btrfs_tree_unlock(c);
3138 662 : free_extent_buffer(c);
3139 662 : path->nodes[level] = split;
3140 662 : path->slots[level + 1] += 1;
3141 : } else {
3142 110 : btrfs_tree_unlock(split);
3143 110 : free_extent_buffer(split);
3144 : }
3145 : return 0;
3146 : }
3147 :
3148 : /*
3149 : * how many bytes are required to store the items in a leaf. start
3150 : * and nr indicate which items in the leaf to check. This totals up the
3151 : * space used both by the item structs and the item data
3152 : */
3153 340125129 : static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3154 : {
3155 340125129 : int data_len;
3156 340125129 : int nritems = btrfs_header_nritems(l);
3157 340125129 : int end = min(nritems, start + nr) - 1;
3158 :
3159 340125129 : if (!nr)
3160 : return 0;
3161 340090085 : data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3162 339906645 : data_len = data_len - btrfs_item_offset(l, end);
3163 339910066 : data_len += sizeof(struct btrfs_item) * nr;
3164 339910066 : WARN_ON(data_len < 0);
3165 : return data_len;
3166 : }
3167 :
3168 : /*
3169 : * The space between the end of the leaf items and
3170 : * the start of the leaf data. IOW, how much room
3171 : * the leaf has left for both items and data
3172 : */
3173 309613313 : int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3174 : {
3175 309613313 : struct btrfs_fs_info *fs_info = leaf->fs_info;
3176 309613313 : int nritems = btrfs_header_nritems(leaf);
3177 309613313 : int ret;
3178 :
3179 309613313 : ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3180 309422297 : if (ret < 0) {
3181 0 : btrfs_crit(fs_info,
3182 : "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3183 : ret,
3184 : (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3185 : leaf_space_used(leaf, 0, nritems), nritems);
3186 : }
3187 309422297 : return ret;
3188 : }
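
/*
 * A worked example (not in the original file) of the space accounting
 * above.  Numbers assume the common 16K nodesize; exact struct sizes may
 * vary between versions:
 *
 *	BTRFS_LEAF_DATA_SIZE = nodesize - sizeof(struct btrfs_header)
 *			     ~= 16384 - 101 = 16283 bytes
 *
 * A leaf holding 100 items whose data totals 4000 bytes consumes roughly
 * 100 * sizeof(struct btrfs_item) + 4000 ~= 100 * 25 + 4000 = 6500 bytes,
 * so btrfs_leaf_free_space() would report about 16283 - 6500 = 9783 bytes.
 */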
3189 :
3190 : /*
3191 : * min slot controls the lowest index we're willing to push to the
3192 : * right. We'll push up to and including min_slot, but no lower
3193 : */
3194 2938351 : static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3195 : struct btrfs_path *path,
3196 : int data_size, int empty,
3197 : struct extent_buffer *right,
3198 : int free_space, u32 left_nritems,
3199 : u32 min_slot)
3200 : {
3201 2938351 : struct btrfs_fs_info *fs_info = right->fs_info;
3202 2938351 : struct extent_buffer *left = path->nodes[0];
3203 2938351 : struct extent_buffer *upper = path->nodes[1];
3204 2938351 : struct btrfs_map_token token;
3205 2938351 : struct btrfs_disk_key disk_key;
3206 2938351 : int slot;
3207 2938351 : u32 i;
3208 2938351 : int push_space = 0;
3209 2938351 : int push_items = 0;
3210 2938351 : u32 nr;
3211 2938351 : u32 right_nritems;
3212 2938351 : u32 data_end;
3213 2938351 : u32 this_item_size;
3214 :
3215 2938351 : if (empty)
3216 : nr = 0;
3217 : else
3218 2920224 : nr = max_t(u32, 1, min_slot);
3219 :
3220 2938351 : if (path->slots[0] >= left_nritems)
3221 3111 : push_space += data_size;
3222 :
3223 2938351 : slot = path->slots[1];
3224 2938351 : i = left_nritems - 1;
3225 100646258 : while (i >= nr) {
3226 100644727 : if (!empty && push_items > 0) {
3227 97086279 : if (path->slots[0] > i)
3228 : break;
3229 96889338 : if (path->slots[0] == i) {
3230 348050 : int space = btrfs_leaf_free_space(left);
3231 :
3232 348051 : if (space + push_space * 2 > free_space)
3233 : break;
3234 : }
3235 : }
3236 :
3237 100275767 : if (path->slots[0] == i)
3238 254340 : push_space += data_size;
3239 :
3240 100275767 : this_item_size = btrfs_item_size(left, i);
3241 100275852 : if (this_item_size + sizeof(struct btrfs_item) +
3242 100275852 : push_space > free_space)
3243 : break;
3244 :
3245 97725984 : push_items++;
3246 97725984 : push_space += this_item_size + sizeof(struct btrfs_item);
3247 97725984 : if (i == 0)
3248 : break;
3249 97707907 : i--;
3250 : }
3251 :
3252 2938437 : if (push_items == 0)
3253 373235 : goto out_unlock;
3254 :
3255 2565202 : WARN_ON(!empty && push_items == left_nritems);
3256 :
3257 : /* push left to right */
3258 2565202 : right_nritems = btrfs_header_nritems(right);
3259 :
3260 2565202 : push_space = btrfs_item_data_end(left, left_nritems - push_items);
3261 2565193 : push_space -= leaf_data_end(left);
3262 :
3263 : /* make room in the right data area */
3264 2565180 : data_end = leaf_data_end(right);
3265 2565195 : memmove_leaf_data(right, data_end - push_space, data_end,
3266 2565195 : BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3267 :
3268 : /* copy from the left data area */
3269 2565201 : copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3270 2565198 : leaf_data_end(left), push_space);
3271 :
3272 2565206 : memmove_leaf_items(right, push_items, 0, right_nritems);
3273 :
3274 : /* copy the items from left to right */
3275 2565196 : copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3276 :
3277 : /* update the item pointers */
3278 2565207 : btrfs_init_map_token(&token, right);
3279 2565196 : right_nritems += push_items;
3280 2565196 : btrfs_set_header_nritems(right, right_nritems);
3281 2565196 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3282 670430491 : for (i = 0; i < right_nritems; i++) {
3283 667865287 : push_space -= btrfs_token_item_size(&token, i);
3284 667866281 : btrfs_set_token_item_offset(&token, i, push_space);
3285 : }
3286 :
3287 2565204 : left_nritems -= push_items;
3288 2565204 : btrfs_set_header_nritems(left, left_nritems);
3289 :
3290 2565204 : if (left_nritems)
3291 2547127 : btrfs_mark_buffer_dirty(left);
3292 : else
3293 18077 : btrfs_clear_buffer_dirty(trans, left);
3294 :
3295 2565204 : btrfs_mark_buffer_dirty(right);
3296 :
3297 2565204 : btrfs_item_key(right, &disk_key, 0);
3298 2565204 : btrfs_set_node_key(upper, &disk_key, slot + 1);
3299 2565169 : btrfs_mark_buffer_dirty(upper);
3300 :
3301 : /* then fixup the leaf pointer in the path */
3302 2565201 : if (path->slots[0] >= left_nritems) {
3303 216536 : path->slots[0] -= left_nritems;
3304 216536 : if (btrfs_header_nritems(path->nodes[0]) == 0)
3305 18077 : btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3306 216536 : btrfs_tree_unlock(path->nodes[0]);
3307 216536 : free_extent_buffer(path->nodes[0]);
3308 216535 : path->nodes[0] = right;
3309 216535 : path->slots[1] += 1;
3310 : } else {
3311 2348665 : btrfs_tree_unlock(right);
3312 2348667 : free_extent_buffer(right);
3313 : }
3314 : return 0;
3315 :
3316 : out_unlock:
3317 373235 : btrfs_tree_unlock(right);
3318 373235 : free_extent_buffer(right);
3319 373235 : return 1;
3320 : }
3321 :
3322 : /*
3323 : * push some data in the path leaf to the right, trying to free up at
3324 : * least data_size bytes.
3325 : *
3326 : * returns 1 if the push failed because the other node didn't have enough
3327 : * room, 0 if everything worked out and < 0 if there were major errors.
3328 : *
3329 : * this will push starting from min_slot to the end of the leaf. It won't
3330 : * push any slot lower than min_slot
3331 : */
3332 7663550 : static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3333 : *root, struct btrfs_path *path,
3334 : int min_data_size, int data_size,
3335 : int empty, u32 min_slot)
3336 : {
3337 7663550 : struct extent_buffer *left = path->nodes[0];
3338 7663550 : struct extent_buffer *right;
3339 7663550 : struct extent_buffer *upper;
3340 7663550 : int slot;
3341 7663550 : int free_space;
3342 7663550 : u32 left_nritems;
3343 7663550 : int ret;
3344 :
3345 7663550 : if (!path->nodes[1])
3346 : return 1;
3347 :
3348 4509667 : slot = path->slots[1];
3349 4509667 : upper = path->nodes[1];
3350 4509667 : if (slot >= btrfs_header_nritems(upper) - 1)
3351 : return 1;
3352 :
3353 3615080 : btrfs_assert_tree_write_locked(path->nodes[1]);
3354 :
3355 3615080 : right = btrfs_read_node_slot(upper, slot + 1);
3356 3615202 : if (IS_ERR(right))
3357 0 : return PTR_ERR(right);
3358 :
3359 3615202 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3360 :
3361 3615158 : free_space = btrfs_leaf_free_space(right);
3362 3615253 : if (free_space < data_size)
3363 656091 : goto out_unlock;
3364 :
3365 2959162 : ret = btrfs_cow_block(trans, root, right, upper,
3366 : slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3367 2959119 : if (ret)
3368 0 : goto out_unlock;
3369 :
3370 2959119 : left_nritems = btrfs_header_nritems(left);
3371 2959119 : if (left_nritems == 0)
3372 0 : goto out_unlock;
3373 :
3374 2959119 : if (check_sibling_keys(left, right)) {
3375 0 : ret = -EUCLEAN;
3376 0 : btrfs_abort_transaction(trans, ret);
3377 0 : btrfs_tree_unlock(right);
3378 0 : free_extent_buffer(right);
3379 0 : return ret;
3380 : }
3381 2959142 : if (path->slots[0] == left_nritems && !empty) {
3382 : /* Key greater than all keys in the leaf, right neighbor has
3383 : * enough room for it and we're not emptying our leaf to delete
3384 : * it, therefore use right neighbor to insert the new item and
3385 : * no need to touch/dirty our left leaf. */
3386 20745 : btrfs_tree_unlock(left);
3387 20745 : free_extent_buffer(left);
3388 20745 : path->nodes[0] = right;
3389 20745 : path->slots[0] = 0;
3390 20745 : path->slots[1]++;
3391 20745 : return 0;
3392 : }
3393 :
3394 2938397 : return __push_leaf_right(trans, path, min_data_size, empty, right,
3395 : free_space, left_nritems, min_slot);
3396 656091 : out_unlock:
3397 656091 : btrfs_tree_unlock(right);
3398 656096 : free_extent_buffer(right);
3399 656096 : return 1;
3400 : }
3401 :
3402 : /*
3403 : * push some data in the path leaf to the left, trying to free up at
3404 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3405 : *
3406 : * max_slot can put a limit on how far into the leaf we'll push items. The
3407 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3408 : * items
3409 : */
3410 1279796 : static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3411 : struct btrfs_path *path, int data_size,
3412 : int empty, struct extent_buffer *left,
3413 : int free_space, u32 right_nritems,
3414 : u32 max_slot)
3415 : {
3416 1279796 : struct btrfs_fs_info *fs_info = left->fs_info;
3417 1279796 : struct btrfs_disk_key disk_key;
3418 1279796 : struct extent_buffer *right = path->nodes[0];
3419 1279796 : int i;
3420 1279796 : int push_space = 0;
3421 1279796 : int push_items = 0;
3422 1279796 : u32 old_left_nritems;
3423 1279796 : u32 nr;
3424 1279796 : int ret = 0;
3425 1279796 : u32 this_item_size;
3426 1279796 : u32 old_left_item_size;
3427 1279796 : struct btrfs_map_token token;
3428 :
3429 1279796 : if (empty)
3430 146689 : nr = min(right_nritems, max_slot);
3431 : else
3432 1133107 : nr = min(right_nritems - 1, max_slot);
3433 :
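       : /*
       :  * Editor's note: descriptive comment added for clarity.  Walk forward
       :  * from the first item in the right leaf, counting how many leading items
       :  * and how many bytes of item data plus headers fit into the left leaf's
       :  * free space, again reserving data_size for the slot we are about to
       :  * insert into when it is among the candidates being pushed.
       :  */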
3434 43566775 : for (i = 0; i < nr; i++) {
3435 43481551 : if (!empty && push_items > 0) {
3436 38242796 : if (path->slots[0] < i)
3437 : break;
3438 38142969 : if (path->slots[0] == i) {
3439 170333 : int space = btrfs_leaf_free_space(right);
3440 :
3441 170333 : if (space + push_space * 2 > free_space)
3442 : break;
3443 : }
3444 : }
3445 :
3446 43313412 : if (path->slots[0] == i)
3447 238523 : push_space += data_size;
3448 :
3449 43313412 : this_item_size = btrfs_item_size(right, i);
3450 43313432 : if (this_item_size + sizeof(struct btrfs_item) + push_space >
3451 : free_space)
3452 : break;
3453 :
3454 42286979 : push_items++;
3455 42286979 : push_space += this_item_size + sizeof(struct btrfs_item);
3456 : }
3457 :
3458 1279816 : if (push_items == 0) {
3459 181998 : ret = 1;
3460 181998 : goto out;
3461 : }
3462 2048945 : WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3463 :
3464 : /* push data from right to left */
3465 1097818 : copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3466 :
3467 1097806 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3468 1097806 : btrfs_item_offset(right, push_items - 1);
3469 :
3470 1097810 : copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3471 : btrfs_item_offset(right, push_items - 1), push_space);
3472 1097817 : old_left_nritems = btrfs_header_nritems(left);
3473 1097817 : BUG_ON(old_left_nritems <= 0);
3474 :
3475 1097817 : btrfs_init_map_token(&token, left);
3476 1097814 : old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3477 43382641 : for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3478 42284826 : u32 ioff;
3479 :
3480 42284826 : ioff = btrfs_token_item_offset(&token, i);
3481 42285128 : btrfs_set_token_item_offset(&token, i,
3482 42285128 : ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3483 : }
3484 1097815 : btrfs_set_header_nritems(left, old_left_nritems + push_items);
3485 :
3486 : /* fixup right node */
3487 1097815 : if (push_items > right_nritems)
3488 0 : WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3489 : right_nritems);
3490 :
3491 1097815 : if (push_items < right_nritems) {
3492 1012991 : push_space = btrfs_item_offset(right, push_items - 1) -
3493 1012990 : leaf_data_end(right);
3494 1012989 : memmove_leaf_data(right,
3495 1012989 : BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3496 1012987 : leaf_data_end(right), push_space);
3497 :
3498 1012993 : memmove_leaf_items(right, 0, push_items,
3499 1012993 : btrfs_header_nritems(right) - push_items);
3500 : }
3501 :
3502 1097817 : btrfs_init_map_token(&token, right);
3503 1097812 : right_nritems -= push_items;
3504 1097812 : btrfs_set_header_nritems(right, right_nritems);
3505 1097812 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3506 185115516 : for (i = 0; i < right_nritems; i++) {
3507 184017701 : push_space = push_space - btrfs_token_item_size(&token, i);
3508 184018523 : btrfs_set_token_item_offset(&token, i, push_space);
3509 : }
3510 :
3511 1097815 : btrfs_mark_buffer_dirty(left);
3512 1097817 : if (right_nritems)
3513 1012993 : btrfs_mark_buffer_dirty(right);
3514 : else
3515 84824 : btrfs_clear_buffer_dirty(trans, right);
3516 :
3517 1097818 : btrfs_item_key(right, &disk_key, 0);
3518 1097818 : fixup_low_keys(path, &disk_key, 1);
3519 :
3520 : /* then fixup the leaf pointer in the path */
3521 1097814 : if (path->slots[0] < push_items) {
3522 229288 : path->slots[0] += old_left_nritems;
3523 229288 : btrfs_tree_unlock(path->nodes[0]);
3524 229288 : free_extent_buffer(path->nodes[0]);
3525 229288 : path->nodes[0] = left;
3526 229288 : path->slots[1] -= 1;
3527 : } else {
3528 868526 : btrfs_tree_unlock(left);
3529 868529 : free_extent_buffer(left);
3530 868529 : path->slots[0] -= push_items;
3531 : }
3532 1097817 : BUG_ON(path->slots[0] < 0);
3533 : return ret;
3534 : out:
3535 181998 : btrfs_tree_unlock(left);
3536 181999 : free_extent_buffer(left);
3537 181999 : return ret;
3538 : }
3539 :
3540 : /*
3541 : * push some data in the path leaf to the left, trying to free up at
3542 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3543 : *
3544 : * max_slot can put a limit on how far into the leaf we'll push items. The
3545 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3546 : * items
3547 : */
3548 5229254 : static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3549 : *root, struct btrfs_path *path, int min_data_size,
3550 : int data_size, int empty, u32 max_slot)
3551 : {
3552 5229254 : struct extent_buffer *right = path->nodes[0];
3553 5229254 : struct extent_buffer *left;
3554 5229254 : int slot;
3555 5229254 : int free_space;
3556 5229254 : u32 right_nritems;
3557 5229254 : int ret = 0;
3558 :
3559 5229254 : slot = path->slots[1];
3560 5229254 : if (slot == 0)
3561 : return 1;
3562 2025077 : if (!path->nodes[1])
3563 : return 1;
3564 :
3565 2025077 : right_nritems = btrfs_header_nritems(right);
3566 2025077 : if (right_nritems == 0)
3567 : return 1;
3568 :
3569 2025073 : btrfs_assert_tree_write_locked(path->nodes[1]);
3570 :
3571 2025073 : left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3572 2025102 : if (IS_ERR(left))
3573 0 : return PTR_ERR(left);
3574 :
3575 2025102 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3576 :
3577 2025103 : free_space = btrfs_leaf_free_space(left);
3578 2025108 : if (free_space < data_size) {
3579 743110 : ret = 1;
3580 743110 : goto out;
3581 : }
3582 :
3583 1281998 : ret = btrfs_cow_block(trans, root, left,
3584 : path->nodes[1], slot - 1, &left,
3585 : BTRFS_NESTING_LEFT_COW);
3586 1281991 : if (ret) {
3587 : /* we hit -ENOSPC, but it isn't fatal here */
3588 2192 : if (ret == -ENOSPC)
3589 2192 : ret = 1;
3590 2192 : goto out;
3591 : }
3592 :
3593 1279799 : if (check_sibling_keys(left, right)) {
3594 0 : ret = -EUCLEAN;
3595 0 : btrfs_abort_transaction(trans, ret);
3596 0 : goto out;
3597 : }
3598 1279804 : return __push_leaf_left(trans, path, min_data_size, empty, left,
3599 : free_space, right_nritems, max_slot);
3600 745302 : out:
3601 745302 : btrfs_tree_unlock(left);
3602 745303 : free_extent_buffer(left);
3603 745303 : return ret;
3604 : }
3605 :
3606 : /*
3607 : * split the path's leaf in two, making sure there is at least data_size
3608 : * available for the resulting leaf level of the path.
3609 : */
3610 363450 : static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3611 : struct btrfs_path *path,
3612 : struct extent_buffer *l,
3613 : struct extent_buffer *right,
3614 : int slot, int mid, int nritems)
3615 : {
3616 363450 : struct btrfs_fs_info *fs_info = trans->fs_info;
3617 363450 : int data_copy_size;
3618 363450 : int rt_data_off;
3619 363450 : int i;
3620 363450 : int ret;
3621 363450 : struct btrfs_disk_key disk_key;
3622 363450 : struct btrfs_map_token token;
3623 :
3624 363450 : nritems = nritems - mid;
3625 363450 : btrfs_set_header_nritems(right, nritems);
3626 363450 : data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3627 :
3628 363451 : copy_leaf_items(right, l, 0, mid, nritems);
3629 :
3630 363451 : copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3631 363451 : leaf_data_end(l), data_copy_size);
3632 :
3633 363447 : rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3634 :
3635 363447 : btrfs_init_map_token(&token, right);
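       : /*
       :  * Editor's note: descriptive comment added for clarity.  The item
       :  * offsets copied from the left leaf still describe where the data used
       :  * to live; bump each one by rt_data_off so it points at the data's new
       :  * location at the end of the right leaf.
       :  */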
3636 30809378 : for (i = 0; i < nritems; i++) {
3637 30082483 : u32 ioff;
3638 :
3639 30082483 : ioff = btrfs_token_item_offset(&token, i);
3640 30082519 : btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3641 : }
3642 :
3643 363448 : btrfs_set_header_nritems(l, mid);
3644 363448 : btrfs_item_key(right, &disk_key, 0);
3645 363449 : ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3646 363444 : if (ret < 0)
3647 : return ret;
3648 :
3649 363444 : btrfs_mark_buffer_dirty(right);
3650 363451 : btrfs_mark_buffer_dirty(l);
3651 363451 : BUG_ON(path->slots[0] != slot);
3652 :
3653 363451 : if (mid <= slot) {
3654 257303 : btrfs_tree_unlock(path->nodes[0]);
3655 257303 : free_extent_buffer(path->nodes[0]);
3656 257303 : path->nodes[0] = right;
3657 257303 : path->slots[0] -= mid;
3658 257303 : path->slots[1] += 1;
3659 : } else {
3660 106148 : btrfs_tree_unlock(right);
3661 106148 : free_extent_buffer(right);
3662 : }
3663 :
3664 363451 : BUG_ON(path->slots[0] < 0);
3665 :
3666 : return 0;
3667 : }
3668 :
3669 : /*
3670 : * double splits happen when we need to insert a big item in the middle
3671 : * of a leaf. A double split can leave us with 3 mostly empty leaves:
3672 : * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3673 : * A B C
3674 : *
3675 : * We avoid this by trying to push the items on either side of our target
3676 : * into the adjacent leaves. If all goes well we can avoid the double split
3677 : * completely.
3678 : */
3679 406 : static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3680 : struct btrfs_root *root,
3681 : struct btrfs_path *path,
3682 : int data_size)
3683 : {
3684 406 : int ret;
3685 406 : int progress = 0;
3686 406 : int slot;
3687 406 : u32 nritems;
3688 406 : int space_needed = data_size;
3689 :
3690 406 : slot = path->slots[0];
3691 406 : if (slot < btrfs_header_nritems(path->nodes[0]))
3692 406 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3693 :
3694 : /*
3695 : * try to push all the items after our slot into the
3696 : * right leaf
3697 : */
3698 406 : ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3699 406 : if (ret < 0)
3700 : return ret;
3701 :
3702 406 : if (ret == 0)
3703 1 : progress++;
3704 :
3705 406 : nritems = btrfs_header_nritems(path->nodes[0]);
3706 : /*
3707 : * our goal is to get our slot at the start or end of a leaf. If
3708 : * we've done so we're done
3709 : */
3710 406 : if (path->slots[0] == 0 || path->slots[0] == nritems)
3711 : return 0;
3712 :
3713 405 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3714 : return 0;
3715 :
3716 : /* try to push all the items before our slot into the next leaf */
3717 405 : slot = path->slots[0];
3718 405 : space_needed = data_size;
3719 405 : if (slot > 0)
3720 405 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3721 405 : ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3722 405 : if (ret < 0)
3723 : return ret;
3724 :
3725 405 : if (ret == 0)
3726 2 : progress++;
3727 :
3728 405 : if (progress)
3729 2 : return 0;
3730 : return 1;
3731 : }
3732 :
3733 : /*
3734 : * split the path's leaf in two, making sure there is at least data_size
3735 : * available for the resulting leaf level of the path.
3736 : *
3737 : * returns 0 if all went well and < 0 on failure.
3738 : */
3739 3858179 : static noinline int split_leaf(struct btrfs_trans_handle *trans,
3740 : struct btrfs_root *root,
3741 : const struct btrfs_key *ins_key,
3742 : struct btrfs_path *path, int data_size,
3743 : int extend)
3744 : {
3745 3858179 : struct btrfs_disk_key disk_key;
3746 3858179 : struct extent_buffer *l;
3747 3858179 : u32 nritems;
3748 3858179 : int mid;
3749 3858179 : int slot;
3750 3858179 : struct extent_buffer *right;
3751 3858179 : struct btrfs_fs_info *fs_info = root->fs_info;
3752 3858179 : int ret = 0;
3753 3858179 : int wret;
3754 3858179 : int split;
3755 3858179 : int num_doubles = 0;
3756 3858179 : int tried_avoid_double = 0;
3757 :
3758 3858179 : l = path->nodes[0];
3759 3858179 : slot = path->slots[0];
3760 4671500 : if (extend && data_size + btrfs_item_size(l, slot) +
3761 813321 : sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3762 : return -EOVERFLOW;
3763 :
3764 : /* first try to make some room by pushing left and right */
3765 3850139 : if (data_size && path->nodes[1]) {
3766 3848320 : int space_needed = data_size;
3767 :
3768 3848320 : if (slot < btrfs_header_nritems(l))
3769 3540931 : space_needed -= btrfs_leaf_free_space(l);
3770 :
3771 3848469 : wret = push_leaf_right(trans, root, path, space_needed,
3772 : space_needed, 0, 0);
3773 3848440 : if (wret < 0)
3774 : return wret;
3775 3848440 : if (wret) {
3776 1280591 : space_needed = data_size;
3777 1280591 : if (slot > 0)
3778 1275520 : space_needed -= btrfs_leaf_free_space(l);
3779 1280591 : wret = push_leaf_left(trans, root, path, space_needed,
3780 : space_needed, 0, (u32)-1);
3781 1280605 : if (wret < 0)
3782 : return wret;
3783 : }
3784 3848454 : l = path->nodes[0];
3785 :
3786 : /* did the pushes work? */
3787 3848454 : if (btrfs_leaf_free_space(l) >= data_size)
3788 : return 0;
3789 : }
3790 :
3791 363720 : if (!path->nodes[1]) {
3792 1785 : ret = insert_new_root(trans, root, path, 1);
3793 1785 : if (ret)
3794 : return ret;
3795 : }
3796 363720 : again:
3797 364531 : split = 1;
3798 364531 : l = path->nodes[0];
3799 364531 : slot = path->slots[0];
3800 364531 : nritems = btrfs_header_nritems(l);
3801 364531 : mid = (nritems + 1) / 2;
3802 :
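       : /*
       :  * Editor's note: descriptive comment added for clarity.  Pick the split
       :  * point: by default split in the middle, but if the half that would keep
       :  * our slot still can't fit data_size bytes, split at the slot itself.
       :  * split == 0 means the new right leaf starts out empty and receives the
       :  * new item, split == 2 means even a split at the slot is not enough and
       :  * we will go around again for a double split.
       :  */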
3803 364531 : if (mid <= slot) {
3804 515673 : if (nritems == 1 ||
3805 257783 : leaf_space_used(l, mid, nritems - mid) + data_size >
3806 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3807 2972 : if (slot >= nritems) {
3808 : split = 0;
3809 : } else {
3810 2616 : mid = slot;
3811 2616 : if (mid != nritems &&
3812 2616 : leaf_space_used(l, mid, nritems - mid) +
3813 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3814 727 : if (data_size && !tried_avoid_double)
3815 365 : goto push_for_double;
3816 : split = 2;
3817 : }
3818 : }
3819 : }
3820 : } else {
3821 106640 : if (leaf_space_used(l, 0, mid) + data_size >
3822 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3823 497 : if (!extend && data_size && slot == 0) {
3824 : split = 0;
3825 173 : } else if ((extend || !data_size) && slot == 0) {
3826 : mid = 1;
3827 : } else {
3828 173 : mid = slot;
3829 346 : if (mid != nritems &&
3830 173 : leaf_space_used(l, mid, nritems - mid) +
3831 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3832 82 : if (data_size && !tried_avoid_double)
3833 41 : goto push_for_double;
3834 : split = 2;
3835 : }
3836 : }
3837 : }
3838 : }
3839 :
3840 : if (split == 0)
3841 680 : btrfs_cpu_key_to_disk(&disk_key, ins_key);
3842 : else
3843 363438 : btrfs_item_key(l, &disk_key, mid);
3844 :
3845 : /*
3846 : * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3847 : * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3848 : * subclasses, which is 8 at the time of this patch, and we've maxed it
3849 : * out. In the future we could add a
3850 : * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3851 : * use BTRFS_NESTING_NEW_ROOT.
3852 : */
3853 727828 : right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3854 : &disk_key, 0, l->start, 0,
3855 : num_doubles ? BTRFS_NESTING_NEW_ROOT :
3856 : BTRFS_NESTING_SPLIT);
3857 364131 : if (IS_ERR(right))
3858 0 : return PTR_ERR(right);
3859 :
3860 364131 : root_add_used(root, fs_info->nodesize);
3861 :
3862 364131 : if (split == 0) {
3863 680 : if (mid <= slot) {
3864 356 : btrfs_set_header_nritems(right, 0);
3865 356 : ret = insert_ptr(trans, path, &disk_key,
3866 356 : right->start, path->slots[1] + 1, 1);
3867 356 : if (ret < 0) {
3868 0 : btrfs_tree_unlock(right);
3869 0 : free_extent_buffer(right);
3870 0 : return ret;
3871 : }
3872 356 : btrfs_tree_unlock(path->nodes[0]);
3873 356 : free_extent_buffer(path->nodes[0]);
3874 356 : path->nodes[0] = right;
3875 356 : path->slots[0] = 0;
3876 356 : path->slots[1] += 1;
3877 : } else {
3878 324 : btrfs_set_header_nritems(right, 0);
3879 324 : ret = insert_ptr(trans, path, &disk_key,
3880 : right->start, path->slots[1], 1);
3881 324 : if (ret < 0) {
3882 0 : btrfs_tree_unlock(right);
3883 0 : free_extent_buffer(right);
3884 0 : return ret;
3885 : }
3886 324 : btrfs_tree_unlock(path->nodes[0]);
3887 324 : free_extent_buffer(path->nodes[0]);
3888 324 : path->nodes[0] = right;
3889 324 : path->slots[0] = 0;
3890 324 : if (path->slots[1] == 0)
3891 43 : fixup_low_keys(path, &disk_key, 1);
3892 : }
3893 : /*
3894 : * We create a new, empty leaf 'right' with room for the required
3895 : * ins_len; btrfs_mark_buffer_dirty() will be done on this leaf after
3896 : * the new item's content has been copied into 'right'.
3897 : */
3898 680 : return ret;
3899 : }
3900 :
3901 363451 : ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3902 363450 : if (ret < 0) {
3903 0 : btrfs_tree_unlock(right);
3904 0 : free_extent_buffer(right);
3905 0 : return ret;
3906 : }
3907 :
3908 363450 : if (split == 2) {
3909 405 : BUG_ON(num_doubles != 0);
3910 405 : num_doubles++;
3911 405 : goto again;
3912 : }
3913 :
3914 : return 0;
3915 :
3916 406 : push_for_double:
3917 406 : push_for_double_split(trans, root, path, data_size);
3918 406 : tried_avoid_double = 1;
3919 406 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3920 : return 0;
3921 406 : goto again;
3922 : }
3923 :
3924 2806622 : static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3925 : struct btrfs_root *root,
3926 : struct btrfs_path *path, int ins_len)
3927 : {
3928 2806622 : struct btrfs_key key;
3929 2806622 : struct extent_buffer *leaf;
3930 2806622 : struct btrfs_file_extent_item *fi;
3931 2806622 : u64 extent_len = 0;
3932 2806622 : u32 item_size;
3933 2806622 : int ret;
3934 :
3935 2806622 : leaf = path->nodes[0];
3936 2806622 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3937 :
3938 2806621 : BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3939 : key.type != BTRFS_EXTENT_CSUM_KEY);
3940 :
3941 2806621 : if (btrfs_leaf_free_space(leaf) >= ins_len)
3942 : return 0;
3943 :
3944 41181 : item_size = btrfs_item_size(leaf, path->slots[0]);
3945 41181 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3946 39143 : fi = btrfs_item_ptr(leaf, path->slots[0],
3947 : struct btrfs_file_extent_item);
3948 39143 : extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3949 : }
3950 41181 : btrfs_release_path(path);
3951 :
3952 41181 : path->keep_locks = 1;
3953 41181 : path->search_for_split = 1;
3954 41181 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3955 41181 : path->search_for_split = 0;
3956 41181 : if (ret > 0)
3957 : ret = -EAGAIN;
3958 41180 : if (ret < 0)
3959 1 : goto err;
3960 :
3961 41180 : ret = -EAGAIN;
3962 41180 : leaf = path->nodes[0];
3963 : /* if our item isn't there, return now */
3964 41180 : if (item_size != btrfs_item_size(leaf, path->slots[0]))
3965 0 : goto err;
3966 :
3967 : /* the leaf has changed, it now has room. return now */
3968 41180 : if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3969 31 : goto err;
3970 :
3971 41149 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3972 39111 : fi = btrfs_item_ptr(leaf, path->slots[0],
3973 : struct btrfs_file_extent_item);
3974 39111 : if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3975 0 : goto err;
3976 : }
3977 :
3978 41149 : ret = split_leaf(trans, root, &key, path, ins_len, 1);
3979 41149 : if (ret)
3980 0 : goto err;
3981 :
3982 41149 : path->keep_locks = 0;
3983 41149 : btrfs_unlock_up_safe(path, 1);
3984 41149 : return 0;
3985 32 : err:
3986 32 : path->keep_locks = 0;
3987 32 : return ret;
3988 : }
3989 :
3990 194677 : static noinline int split_item(struct btrfs_path *path,
3991 : const struct btrfs_key *new_key,
3992 : unsigned long split_offset)
3993 : {
3994 194677 : struct extent_buffer *leaf;
3995 194677 : int orig_slot, slot;
3996 194677 : char *buf;
3997 194677 : u32 nritems;
3998 194677 : u32 item_size;
3999 194677 : u32 orig_offset;
4000 194677 : struct btrfs_disk_key disk_key;
4001 :
4002 194677 : leaf = path->nodes[0];
4003 : /*
4004 : * Shouldn't happen because the caller must have previously called
4005 : * setup_leaf_for_split() to make room for the new item in the leaf.
4006 : */
4007 194677 : if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
4008 : return -ENOSPC;
4009 :
4010 194677 : orig_slot = path->slots[0];
4011 194677 : orig_offset = btrfs_item_offset(leaf, path->slots[0]);
4012 194677 : item_size = btrfs_item_size(leaf, path->slots[0]);
4013 :
4014 194677 : buf = kmalloc(item_size, GFP_NOFS);
4015 194677 : if (!buf)
4016 : return -ENOMEM;
4017 :
4018 194677 : read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4019 : path->slots[0]), item_size);
4020 :
4021 194677 : slot = path->slots[0] + 1;
4022 194677 : nritems = btrfs_header_nritems(leaf);
4023 194677 : if (slot != nritems) {
4024 : /* shift the items */
4025 188674 : memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
4026 : }
4027 :
4028 194677 : btrfs_cpu_key_to_disk(&disk_key, new_key);
4029 194677 : btrfs_set_item_key(leaf, &disk_key, slot);
4030 :
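       : /*
       :  * Editor's note: descriptive comment added for clarity.  The first
       :  * split_offset bytes of data stay with the original key at orig_slot;
       :  * the remaining bytes become the data of the new item (new_key) in the
       :  * slot right after it.
       :  */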
4031 194677 : btrfs_set_item_offset(leaf, slot, orig_offset);
4032 194677 : btrfs_set_item_size(leaf, slot, item_size - split_offset);
4033 :
4034 194677 : btrfs_set_item_offset(leaf, orig_slot,
4035 194677 : orig_offset + item_size - split_offset);
4036 194677 : btrfs_set_item_size(leaf, orig_slot, split_offset);
4037 :
4038 194677 : btrfs_set_header_nritems(leaf, nritems + 1);
4039 :
4040 : /* write the data for the start of the original item */
4041 389354 : write_extent_buffer(leaf, buf,
4042 194677 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4043 : split_offset);
4044 :
4045 : /* write the data for the new item */
4046 194677 : write_extent_buffer(leaf, buf + split_offset,
4047 194677 : btrfs_item_ptr_offset(leaf, slot),
4048 : item_size - split_offset);
4049 194677 : btrfs_mark_buffer_dirty(leaf);
4050 :
4051 194677 : BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4052 194677 : kfree(buf);
4053 194677 : return 0;
4054 : }
4055 :
4056 : /*
4057 : * This function splits a single item into two items,
4058 : * giving 'new_key' to the new item and splitting the
4059 : * old one at split_offset (from the start of the item).
4060 : *
4061 : * The path may be released by this operation. After
4062 : * the split, the path is pointing to the old item. The
4063 : * new item is going to be in the same node as the old one.
4064 : *
4065 : * Note, the item being split must be small enough to live alone on
4066 : * a tree block with room for one extra struct btrfs_item
4067 : *
4068 : * This allows us to split the item in place, keeping a lock on the
4069 : * leaf the entire time.
4070 : */
4071 194677 : int btrfs_split_item(struct btrfs_trans_handle *trans,
4072 : struct btrfs_root *root,
4073 : struct btrfs_path *path,
4074 : const struct btrfs_key *new_key,
4075 : unsigned long split_offset)
4076 : {
4077 194677 : int ret;
4078 194677 : ret = setup_leaf_for_split(trans, root, path,
4079 : sizeof(struct btrfs_item));
4080 194677 : if (ret)
4081 : return ret;
4082 :
4083 194677 : ret = split_item(path, new_key, split_offset);
4084 194677 : return ret;
4085 : }
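       :
       : /*
       :  * Editor's sketch, not part of the original source: rough shape of a
       :  * caller, assuming it has already searched to the item with
       :  * btrfs_search_slot() and chosen new_key for the tail portion:
       :  *
       :  *	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);
       :  *	if (ret)
       :  *		return ret;
       :  *	// path->slots[0] still points at the shortened front half; the
       :  *	// tail carrying new_key now sits in the next slot of the same leaf
       :  */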
4086 :
4087 : /*
4088 : * make the item pointed to by the path smaller. new_size indicates
4089 : * how small to make it, and from_end tells us if we just chop bytes
4090 : * off the end of the item or if we shift the item to chop bytes off
4091 : * the front.
4092 : */
4093 16276797 : void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4094 : {
4095 16276797 : int slot;
4096 16276797 : struct extent_buffer *leaf;
4097 16276797 : u32 nritems;
4098 16276797 : unsigned int data_end;
4099 16276797 : unsigned int old_data_start;
4100 16276797 : unsigned int old_size;
4101 16276797 : unsigned int size_diff;
4102 16276797 : int i;
4103 16276797 : struct btrfs_map_token token;
4104 :
4105 16276797 : leaf = path->nodes[0];
4106 16276797 : slot = path->slots[0];
4107 :
4108 16276797 : old_size = btrfs_item_size(leaf, slot);
4109 16276777 : if (old_size == new_size)
4110 0 : return;
4111 :
4112 16276777 : nritems = btrfs_header_nritems(leaf);
4113 16276777 : data_end = leaf_data_end(leaf);
4114 :
4115 16276759 : old_data_start = btrfs_item_offset(leaf, slot);
4116 :
4117 16276769 : size_diff = old_size - new_size;
4118 :
4119 16276769 : BUG_ON(slot < 0);
4120 16276769 : BUG_ON(slot >= nritems);
4121 :
4122 : /*
4123 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4124 : */
4125 : /* first correct the data pointers */
4126 16276769 : btrfs_init_map_token(&token, leaf);
4127 1367362158 : for (i = slot; i < nritems; i++) {
4128 1334808574 : u32 ioff;
4129 :
4130 1334808574 : ioff = btrfs_token_item_offset(&token, i);
4131 1334808899 : btrfs_set_token_item_offset(&token, i, ioff + size_diff);
4132 : }
4133 :
4134 : /* shift the data */
4135 16276815 : if (from_end) {
4136 14249341 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4137 14249341 : old_data_start + new_size - data_end);
4138 : } else {
4139 2027474 : struct btrfs_disk_key disk_key;
4140 2027474 : u64 offset;
4141 :
4142 2027474 : btrfs_item_key(leaf, &disk_key, slot);
4143 :
4144 2027474 : if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4145 0 : unsigned long ptr;
4146 0 : struct btrfs_file_extent_item *fi;
4147 :
4148 0 : fi = btrfs_item_ptr(leaf, slot,
4149 : struct btrfs_file_extent_item);
4150 0 : fi = (struct btrfs_file_extent_item *)(
4151 0 : (unsigned long)fi - size_diff);
4152 :
4153 0 : if (btrfs_file_extent_type(leaf, fi) ==
4154 : BTRFS_FILE_EXTENT_INLINE) {
4155 0 : ptr = btrfs_item_ptr_offset(leaf, slot);
4156 0 : memmove_extent_buffer(leaf, ptr,
4157 : (unsigned long)fi,
4158 : BTRFS_FILE_EXTENT_INLINE_DATA_START);
4159 : }
4160 : }
4161 :
4162 2027474 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4163 2027474 : old_data_start - data_end);
4164 :
4165 2027474 : offset = btrfs_disk_key_offset(&disk_key);
4166 2027474 : btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4167 2027474 : btrfs_set_item_key(leaf, &disk_key, slot);
4168 2027472 : if (slot == 0)
4169 620015 : fixup_low_keys(path, &disk_key, 1);
4170 : }
4171 :
4172 16276812 : btrfs_set_item_size(leaf, slot, new_size);
4173 16276795 : btrfs_mark_buffer_dirty(leaf);
4174 :
4175 16276809 : if (btrfs_leaf_free_space(leaf) < 0) {
4176 0 : btrfs_print_leaf(leaf);
4177 16276794 : BUG();
4178 : }
4179 : }
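       :
       : /*
       :  * Editor's sketch, not part of the original source: the two ways this is
       :  * typically used, assuming the path already points at the item:
       :  *
       :  *	btrfs_truncate_item(path, new_size, 1);	// drop bytes from the end
       :  *	btrfs_truncate_item(path, new_size, 0);	// drop bytes from the front
       :  *
       :  * With from_end == 0 the item key's offset is also bumped by the number
       :  * of bytes removed, as the code above does when fixing up the disk key.
       :  */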
4180 :
4181 : /*
4182 : * make the item pointed to by the path bigger, data_size is the added size.
4183 : */
4184 18832981 : void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4185 : {
4186 18832981 : int slot;
4187 18832981 : struct extent_buffer *leaf;
4188 18832981 : u32 nritems;
4189 18832981 : unsigned int data_end;
4190 18832981 : unsigned int old_data;
4191 18832981 : unsigned int old_size;
4192 18832981 : int i;
4193 18832981 : struct btrfs_map_token token;
4194 :
4195 18832981 : leaf = path->nodes[0];
4196 :
4197 18832981 : nritems = btrfs_header_nritems(leaf);
4198 18832981 : data_end = leaf_data_end(leaf);
4199 :
4200 18832940 : if (btrfs_leaf_free_space(leaf) < data_size) {
4201 0 : btrfs_print_leaf(leaf);
4202 0 : BUG();
4203 : }
4204 18832968 : slot = path->slots[0];
4205 18832968 : old_data = btrfs_item_data_end(leaf, slot);
4206 :
4207 18832936 : BUG_ON(slot < 0);
4208 18832936 : if (slot >= nritems) {
4209 0 : btrfs_print_leaf(leaf);
4210 0 : btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4211 : slot, nritems);
4212 0 : BUG();
4213 : }
4214 :
4215 : /*
4216 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4217 : */
4218 : /* first correct the data pointers */
4219 18832936 : btrfs_init_map_token(&token, leaf);
4220 1426971090 : for (i = slot; i < nritems; i++) {
4221 1389305141 : u32 ioff;
4222 :
4223 1389305141 : ioff = btrfs_token_item_offset(&token, i);
4224 1389306555 : btrfs_set_token_item_offset(&token, i, ioff - data_size);
4225 : }
4226 :
4227 : /* shift the data */
4228 18833013 : memmove_leaf_data(leaf, data_end - data_size, data_end,
4229 18833013 : old_data - data_end);
4230 :
4231 18833034 : data_end = old_data;
4232 18833034 : old_size = btrfs_item_size(leaf, slot);
4233 18833004 : btrfs_set_item_size(leaf, slot, old_size + data_size);
4234 18832971 : btrfs_mark_buffer_dirty(leaf);
4235 :
4236 18833047 : if (btrfs_leaf_free_space(leaf) < 0) {
4237 0 : btrfs_print_leaf(leaf);
4238 0 : BUG();
4239 : }
4240 18832991 : }
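       :
       : /*
       :  * Editor's sketch, not part of the original source: growing an item in
       :  * place, assuming the caller already reserved the extra bytes (for
       :  * example by passing them as the ins_len to btrfs_search_slot()):
       :  *
       :  *	btrfs_extend_item(path, extra_bytes);
       :  *
       :  * The existing data keeps its place at the front of the item; the added
       :  * bytes appear at its end and are not initialized here, so the caller
       :  * is expected to fill them in.
       :  */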
4241 :
4242 : /*
4243 : * Make space in the node before inserting one or more items.
4244 : *
4245 : * @root: root we are inserting items to
4246 : * @path: points to the leaf/slot where we are going to insert new items
4247 : * @batch: information about the batch of items to insert
4248 : *
4249 : * Main purpose is to save stack depth by doing the bulk of the work in a
4250 : * function that doesn't call btrfs_search_slot
4251 : */
4252 67974159 : static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4253 : const struct btrfs_item_batch *batch)
4254 : {
4255 67974159 : struct btrfs_fs_info *fs_info = root->fs_info;
4256 67974159 : int i;
4257 67974159 : u32 nritems;
4258 67974159 : unsigned int data_end;
4259 67974159 : struct btrfs_disk_key disk_key;
4260 67974159 : struct extent_buffer *leaf;
4261 67974159 : int slot;
4262 67974159 : struct btrfs_map_token token;
4263 67974159 : u32 total_size;
4264 :
4265 : /*
4266 : * Before anything else, update keys in the parent and other ancestors
4267 : * if needed, then release the write locks on them, so that other tasks
4268 : * can use them while we modify the leaf.
4269 : */
4270 67974159 : if (path->slots[0] == 0) {
4271 370868 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4272 370865 : fixup_low_keys(path, &disk_key, 1);
4273 : }
4274 67974160 : btrfs_unlock_up_safe(path, 1);
4275 :
4276 67971958 : leaf = path->nodes[0];
4277 67971958 : slot = path->slots[0];
4278 :
4279 67971958 : nritems = btrfs_header_nritems(leaf);
4280 67971958 : data_end = leaf_data_end(leaf);
4281 67960728 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4282 :
4283 67960728 : if (btrfs_leaf_free_space(leaf) < total_size) {
4284 0 : btrfs_print_leaf(leaf);
4285 0 : btrfs_crit(fs_info, "not enough freespace need %u have %d",
4286 : total_size, btrfs_leaf_free_space(leaf));
4287 0 : BUG();
4288 : }
4289 :
4290 67956783 : btrfs_init_map_token(&token, leaf);
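       : /*
       :  * Editor's note: descriptive comment added for clarity.  If we are
       :  * inserting somewhere before the end of the leaf, open a gap: lower the
       :  * data offsets of every following item by total_data_size, then shift
       :  * those item headers and their data out of the way.
       :  */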
4291 67953168 : if (slot != nritems) {
4292 40497126 : unsigned int old_data = btrfs_item_data_end(leaf, slot);
4293 :
4294 40496634 : if (old_data < data_end) {
4295 0 : btrfs_print_leaf(leaf);
4296 0 : btrfs_crit(fs_info,
4297 : "item at slot %d with data offset %u beyond data end of leaf %u",
4298 : slot, old_data, data_end);
4299 0 : BUG();
4300 : }
4301 : /*
4302 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4303 : */
4304 : /* first correct the data pointers */
4305 3663575030 : for (i = slot; i < nritems; i++) {
4306 3623077598 : u32 ioff;
4307 :
4308 3623077598 : ioff = btrfs_token_item_offset(&token, i);
4309 3623344166 : btrfs_set_token_item_offset(&token, i,
4310 3623344166 : ioff - batch->total_data_size);
4311 : }
4312 : /* shift the items */
4313 40497432 : memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4314 :
4315 : /* shift the data */
4316 40496633 : memmove_leaf_data(leaf, data_end - batch->total_data_size,
4317 40496633 : data_end, old_data - data_end);
4318 40496633 : data_end = old_data;
4319 : }
4320 :
4321 : /* setup the item for the new data */
4322 142905734 : for (i = 0; i < batch->nr; i++) {
4323 74940983 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4324 74920619 : btrfs_set_item_key(leaf, &disk_key, slot + i);
4325 74920078 : data_end -= batch->data_sizes[i];
4326 74920078 : btrfs_set_token_item_offset(&token, slot + i, data_end);
4327 74915567 : btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4328 : }
4329 :
4330 67964751 : btrfs_set_header_nritems(leaf, nritems + batch->nr);
4331 67964751 : btrfs_mark_buffer_dirty(leaf);
4332 :
4333 67978373 : if (btrfs_leaf_free_space(leaf) < 0) {
4334 0 : btrfs_print_leaf(leaf);
4335 0 : BUG();
4336 : }
4337 67966642 : }
4338 :
4339 : /*
4340 : * Insert a new item into a leaf.
4341 : *
4342 : * @root: The root of the btree.
4343 : * @path: A path pointing to the target leaf and slot.
4344 : * @key: The key of the new item.
4345 : * @data_size: The size of the data associated with the new key.
4346 : */
4347 3807681 : void btrfs_setup_item_for_insert(struct btrfs_root *root,
4348 : struct btrfs_path *path,
4349 : const struct btrfs_key *key,
4350 : u32 data_size)
4351 : {
4352 6419595 : struct btrfs_item_batch batch;
4353 :
4354 6419595 : batch.keys = key;
4355 6419595 : batch.data_sizes = &data_size;
4356 6419595 : batch.total_data_size = data_size;
4357 6419595 : batch.nr = 1;
4358 :
4359 3807681 : setup_items_for_insert(root, path, &batch);
4360 3807673 : }
4361 :
4362 : /*
4363 : * Given a key and some data, insert items into the tree.
4364 : * This does all the path init required, making room in the tree if needed.
4365 : */
4366 62009031 : int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4367 : struct btrfs_root *root,
4368 : struct btrfs_path *path,
4369 : const struct btrfs_item_batch *batch)
4370 : {
4371 62009031 : int ret = 0;
4372 62009031 : int slot;
4373 62009031 : u32 total_size;
4374 :
4375 62009031 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4376 62009031 : ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4377 62008982 : if (ret == 0)
4378 : return -EEXIST;
4379 61563704 : if (ret < 0)
4380 : return ret;
4381 :
4382 61555658 : slot = path->slots[0];
4383 61555658 : BUG_ON(slot < 0);
4384 :
4385 61555658 : setup_items_for_insert(root, path, batch);
4386 61555658 : return 0;
4387 : }
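       :
       : /*
       :  * Editor's sketch, not part of the original source: inserting two items
       :  * in a single batch; the key values and sizes are illustrative only:
       :  *
       :  *	struct btrfs_key keys[2] = { first_key, second_key };
       :  *	u32 sizes[2] = { first_size, second_size };
       :  *	struct btrfs_item_batch batch = {
       :  *		.keys = keys,
       :  *		.data_sizes = sizes,
       :  *		.total_data_size = first_size + second_size,
       :  *		.nr = 2,
       :  *	};
       :  *
       :  *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
       :  *
       :  * On success the caller fills in the data for each new slot starting at
       :  * path->slots[0] and then marks the leaf dirty, as btrfs_insert_item()
       :  * below does for the single item case.
       :  */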
4388 :
4389 : /*
4390 : * Given a key and some data, insert an item into the tree.
4391 : * This does all the path init required, making room in the tree if needed.
4392 : */
4393 9748 : int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4394 : const struct btrfs_key *cpu_key, void *data,
4395 : u32 data_size)
4396 : {
4397 9748 : int ret = 0;
4398 9748 : struct btrfs_path *path;
4399 9748 : struct extent_buffer *leaf;
4400 9748 : unsigned long ptr;
4401 :
4402 9748 : path = btrfs_alloc_path();
4403 9748 : if (!path)
4404 : return -ENOMEM;
4405 9748 : ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4406 9748 : if (!ret) {
4407 9748 : leaf = path->nodes[0];
4408 9748 : ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4409 9748 : write_extent_buffer(leaf, data, ptr, data_size);
4410 9748 : btrfs_mark_buffer_dirty(leaf);
4411 : }
4412 9748 : btrfs_free_path(path);
4413 9748 : return ret;
4414 : }
4415 :
4416 : /*
4417 : * This function duplicates an item, giving 'new_key' to the new item.
4418 : * It guarantees both items live in the same tree leaf and the new item is
4419 : * contiguous with the original item.
4420 : *
4421 : * This allows us to split a file extent in place, keeping a lock on the leaf
4422 : * the entire time.
4423 : */
4424 2611947 : int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4425 : struct btrfs_root *root,
4426 : struct btrfs_path *path,
4427 : const struct btrfs_key *new_key)
4428 : {
4429 2611947 : struct extent_buffer *leaf;
4430 2611947 : int ret;
4431 2611947 : u32 item_size;
4432 :
4433 2611947 : leaf = path->nodes[0];
4434 2611947 : item_size = btrfs_item_size(leaf, path->slots[0]);
4435 2611943 : ret = setup_leaf_for_split(trans, root, path,
4436 2611943 : item_size + sizeof(struct btrfs_item));
4437 2611947 : if (ret)
4438 : return ret;
4439 :
4440 2611914 : path->slots[0]++;
4441 2611914 : btrfs_setup_item_for_insert(root, path, new_key, item_size);
4442 2611916 : leaf = path->nodes[0];
4443 7835744 : memcpy_extent_buffer(leaf,
4444 5223828 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4445 2611916 : btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4446 : item_size);
4447 2611912 : return 0;
4448 : }
4449 :
4450 : /*
4451 : * delete the pointer from a given node.
4452 : *
4453 : * the tree should have been previously balanced so the deletion does not
4454 : * empty a node.
4455 : *
4456 : * This is exported for use inside btrfs-progs, don't un-export it.
4457 : */
4458 113751 : int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4459 : struct btrfs_path *path, int level, int slot)
4460 : {
4461 113751 : struct extent_buffer *parent = path->nodes[level];
4462 113751 : u32 nritems;
4463 113751 : int ret;
4464 :
4465 113751 : nritems = btrfs_header_nritems(parent);
4466 113751 : if (slot != nritems - 1) {
4467 110555 : if (level) {
4468 110555 : ret = btrfs_tree_mod_log_insert_move(parent, slot,
4469 110555 : slot + 1, nritems - slot - 1);
4470 110554 : if (ret < 0) {
4471 0 : btrfs_abort_transaction(trans, ret);
4472 0 : return ret;
4473 : }
4474 : }
4475 110554 : memmove_extent_buffer(parent,
4476 : btrfs_node_key_ptr_offset(parent, slot),
4477 : btrfs_node_key_ptr_offset(parent, slot + 1),
4478 : sizeof(struct btrfs_key_ptr) *
4479 110554 : (nritems - slot - 1));
4480 3196 : } else if (level) {
4481 3196 : ret = btrfs_tree_mod_log_insert_key(parent, slot,
4482 : BTRFS_MOD_LOG_KEY_REMOVE);
4483 3196 : if (ret < 0) {
4484 0 : btrfs_abort_transaction(trans, ret);
4485 0 : return ret;
4486 : }
4487 : }
4488 :
4489 113750 : nritems--;
4490 113750 : btrfs_set_header_nritems(parent, nritems);
4491 113750 : if (nritems == 0 && parent == root->node) {
4492 0 : BUG_ON(btrfs_header_level(root->node) != 1);
4493 : /* just turn the root into a leaf and break */
4494 0 : btrfs_set_header_level(root->node, 0);
4495 113750 : } else if (slot == 0) {
4496 4749 : struct btrfs_disk_key disk_key;
4497 :
4498 4749 : btrfs_node_key(parent, &disk_key, 0);
4499 4749 : fixup_low_keys(path, &disk_key, level + 1);
4500 : }
4501 113750 : btrfs_mark_buffer_dirty(parent);
4502 113750 : return 0;
4503 : }
4504 :
4505 : /*
4506 : * a helper function to delete the leaf pointed to by path->slots[1] and
4507 : * path->nodes[1].
4508 : *
4509 : * This deletes the pointer in path->nodes[1] and frees the leaf
4510 : * block extent. zero is returned if it all worked out, < 0 otherwise.
4511 : *
4512 : * The path must have already been setup for deleting the leaf, including
4513 : * all the proper balancing. path->nodes[1] must be locked.
4514 : */
4515 113642 : static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4516 : struct btrfs_root *root,
4517 : struct btrfs_path *path,
4518 : struct extent_buffer *leaf)
4519 : {
4520 113642 : int ret;
4521 :
4522 113642 : WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4523 113642 : ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4524 113641 : if (ret < 0)
4525 : return ret;
4526 :
4527 : /*
4528 : * btrfs_free_extent is expensive, we want to make sure we
4529 : * aren't holding any locks when we call it
4530 : */
4531 113641 : btrfs_unlock_up_safe(path, 0);
4532 :
4533 113642 : root_sub_used(root, leaf->len);
4534 :
4535 113642 : atomic_inc(&leaf->refs);
4536 113642 : btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4537 113642 : free_extent_buffer_stale(leaf);
4538 113642 : return 0;
4539 : }
4540 : /*
4541 : * delete the item at the leaf level in path. If that empties
4542 : * the leaf, remove it from the tree
4543 : */
4544 26354348 : int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4545 : struct btrfs_path *path, int slot, int nr)
4546 : {
4547 26354348 : struct btrfs_fs_info *fs_info = root->fs_info;
4548 26354348 : struct extent_buffer *leaf;
4549 26354348 : int ret = 0;
4550 26354348 : int wret;
4551 26354348 : u32 nritems;
4552 :
4553 26354348 : leaf = path->nodes[0];
4554 26354348 : nritems = btrfs_header_nritems(leaf);
4555 :
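       : /*
       :  * Editor's note: descriptive comment added for clarity.  If we are not
       :  * deleting the last items of the leaf, close the gap: slide the
       :  * surviving item data toward the end of the leaf by the deleted size
       :  * (dsize), bump the offsets of the items that follow, and move their
       :  * headers down over the deleted slots.
       :  */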
4556 26354348 : if (slot + nr != nritems) {
4557 21181486 : const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4558 21180565 : const int data_end = leaf_data_end(leaf);
4559 21179982 : struct btrfs_map_token token;
4560 21179982 : u32 dsize = 0;
4561 21179982 : int i;
4562 :
4563 47354579 : for (i = 0; i < nr; i++)
4564 26174414 : dsize += btrfs_item_size(leaf, slot + i);
4565 :
4566 21180165 : memmove_leaf_data(leaf, data_end + dsize, data_end,
4567 21180165 : last_off - data_end);
4568 :
4569 21180208 : btrfs_init_map_token(&token, leaf);
4570 2013212099 : for (i = slot + nr; i < nritems; i++) {
4571 1970851000 : u32 ioff;
4572 :
4573 1970851000 : ioff = btrfs_token_item_offset(&token, i);
4574 1970781869 : btrfs_set_token_item_offset(&token, i, ioff + dsize);
4575 : }
4576 :
4577 21180891 : memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4578 : }
4579 26354125 : btrfs_set_header_nritems(leaf, nritems - nr);
4580 26354125 : nritems -= nr;
4581 :
4582 : /* delete the leaf if we've emptied it */
4583 26354125 : if (nritems == 0) {
4584 11212 : if (leaf == root->node) {
4585 471 : btrfs_set_header_level(leaf, 0);
4586 : } else {
4587 10741 : btrfs_clear_buffer_dirty(trans, leaf);
4588 10741 : ret = btrfs_del_leaf(trans, root, path, leaf);
4589 10741 : if (ret < 0)
4590 : return ret;
4591 : }
4592 : } else {
4593 26342913 : int used = leaf_space_used(leaf, 0, nritems);
4594 26342054 : if (slot == 0) {
4595 2333239 : struct btrfs_disk_key disk_key;
4596 :
4597 2333239 : btrfs_item_key(leaf, &disk_key, 0);
4598 2333243 : fixup_low_keys(path, &disk_key, 1);
4599 : }
4600 :
4601 : /*
4602 : * Try to delete the leaf if it is mostly empty. We do this by
4603 : * trying to move all its items into its left and right neighbours.
4604 : * If we can't move all the items, then we don't delete it - it's
4605 : * not ideal, but future insertions might fill the leaf with more
4606 : * items, or items from other leaves might be moved later into our
4607 : * leaf due to deletions on those leaves.
4608 : */
4609 26342066 : if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4610 3948232 : u32 min_push_space;
4611 :
4612 : /* push_leaf_left fixes the path.
4613 : * make sure the path still points to our leaf
4614 : * for a possible call to btrfs_del_ptr below
4615 : */
4616 3948232 : slot = path->slots[1];
4617 3948232 : atomic_inc(&leaf->refs);
4618 : /*
4619 : * We want to be able to at least push one item to the
4620 : * left neighbour leaf, and that's the first item.
4621 : */
4622 3948250 : min_push_space = sizeof(struct btrfs_item) +
4623 : btrfs_item_size(leaf, 0);
4624 3948243 : wret = push_leaf_left(trans, root, path, 0,
4625 : min_push_space, 1, (u32)-1);
4626 3948243 : if (wret < 0 && wret != -ENOSPC)
4627 0 : ret = wret;
4628 :
4629 3948243 : if (path->nodes[0] == leaf &&
4630 : btrfs_header_nritems(leaf)) {
4631 : /*
4632 : * If we were not able to push all items from our
4633 : * leaf to its left neighbour, then attempt to
4634 : * either push all the remaining items to the
4635 : * right neighbour or none. There's no advantage
4636 : * in pushing only some items, instead of all, as
4637 : * it's pointless to end up with a leaf having
4638 : * too few items while the neighbours can be full
4639 : * or nearly full.
4640 : */
4641 3814790 : nritems = btrfs_header_nritems(leaf);
4642 3814790 : min_push_space = leaf_space_used(leaf, 0, nritems);
4643 3814758 : wret = push_leaf_right(trans, root, path, 0,
4644 : min_push_space, 1, 0);
4645 3814769 : if (wret < 0 && wret != -ENOSPC)
4646 0 : ret = wret;
4647 : }
4648 :
4649 3948222 : if (btrfs_header_nritems(leaf) == 0) {
4650 102901 : path->slots[1] = slot;
4651 102901 : ret = btrfs_del_leaf(trans, root, path, leaf);
4652 102901 : if (ret < 0)
4653 : return ret;
4654 102901 : free_extent_buffer(leaf);
4655 102901 : ret = 0;
4656 : } else {
4657 : /* if we're still in the path, make sure
4658 : * we're dirty. Otherwise, one of the
4659 : * push_leaf functions must have already
4660 : * dirtied this buffer
4661 : */
4662 3845321 : if (path->nodes[0] == leaf)
4663 3796691 : btrfs_mark_buffer_dirty(leaf);
4664 3845308 : free_extent_buffer(leaf);
4665 : }
4666 : } else {
4667 22393834 : btrfs_mark_buffer_dirty(leaf);
4668 : }
4669 : }
4670 : return ret;
4671 : }
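       :
       : /*
       :  * Editor's sketch, not part of the original source: deleting the single
       :  * item the path currently points at, assuming the path came from a
       :  * btrfs_search_slot() that found the key:
       :  *
       :  *	ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
       :  *
       :  * Callers removing a contiguous run of items pass the first slot and
       :  * the number of items instead.
       :  */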
4672 :
4673 : /*
4674 : * A helper function to walk down the tree starting at min_key, and looking
4675 : * for nodes or leaves that have a minimum transaction id.
4676 : * This is used by the btree defrag code, and tree logging
4677 : *
4678 : * This does not cow, but it does stuff the starting key it finds back
4679 : * into min_key, so you can call btrfs_search_slot with cow=1 on the
4680 : * key and get a writable path.
4681 : *
4682 : * This honors path->lowest_level to prevent descent past a given level
4683 : * of the tree.
4684 : *
4685 : * min_trans indicates the oldest transaction that you are interested
4686 : * in walking through. Any nodes or leaves older than min_trans are
4687 : * skipped over (without reading them).
4688 : *
4689 : * returns zero if something useful was found, < 0 on error and 1 if there
4690 : * was nothing in the tree that matched the search criteria.
4691 : */
4692 248722 : int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4693 : struct btrfs_path *path,
4694 : u64 min_trans)
4695 : {
4696 248722 : struct extent_buffer *cur;
4697 248722 : struct btrfs_key found_key;
4698 248722 : int slot;
4699 248722 : int sret;
4700 248722 : u32 nritems;
4701 248722 : int level;
4702 248722 : int ret = 1;
4703 248722 : int keep_locks = path->keep_locks;
4704 :
4705 248722 : ASSERT(!path->nowait);
4706 248722 : path->keep_locks = 1;
4707 258831 : again:
4708 258831 : cur = btrfs_read_lock_root_node(root);
4709 258832 : level = btrfs_header_level(cur);
4710 258832 : WARN_ON(path->nodes[level]);
4711 258832 : path->nodes[level] = cur;
4712 258832 : path->locks[level] = BTRFS_READ_LOCK;
4713 :
4714 258832 : if (btrfs_header_generation(cur) < min_trans) {
4715 154 : ret = 1;
4716 154 : goto out;
4717 : }
4718 773904 : while (1) {
4719 516291 : nritems = btrfs_header_nritems(cur);
4720 516291 : level = btrfs_header_level(cur);
4721 516291 : sret = btrfs_bin_search(cur, 0, min_key, &slot);
4722 516289 : if (sret < 0) {
4723 0 : ret = sret;
4724 0 : goto out;
4725 : }
4726 :
4727 : /* at the lowest level, we're done, setup the path and exit */
4728 516289 : if (level == path->lowest_level) {
4729 258619 : if (slot >= nritems)
4730 102591 : goto find_next_key;
4731 156028 : ret = 0;
4732 156028 : path->slots[level] = slot;
4733 156028 : btrfs_item_key_to_cpu(cur, &found_key, slot);
4734 156030 : goto out;
4735 : }
4736 257670 : if (sret && slot > 0)
4737 247420 : slot--;
4738 : /*
4739 : * check this node pointer against the min_trans parameters.
4740 : * If it is too old, skip to the next one.
4741 : */
4742 266947 : while (slot < nritems) {
4743 266890 : u64 gen;
4744 :
4745 266890 : gen = btrfs_node_ptr_generation(cur, slot);
4746 266890 : if (gen < min_trans) {
4747 9277 : slot++;
4748 9277 : continue;
4749 : }
4750 : break;
4751 : }
4752 57 : find_next_key:
4753 : /*
4754 : * we didn't find a candidate key in this node, walk forward
4755 : * and find another one
4756 : */
4757 360261 : if (slot >= nritems) {
4758 102648 : path->slots[level] = slot;
4759 102648 : sret = btrfs_find_next_key(root, path, min_key, level,
4760 : min_trans);
4761 102647 : if (sret == 0) {
4762 10109 : btrfs_release_path(path);
4763 10109 : goto again;
4764 : } else {
4765 92538 : goto out;
4766 : }
4767 : }
4768 : /* save our key for returning back */
4769 257613 : btrfs_node_key_to_cpu(cur, &found_key, slot);
4770 257613 : path->slots[level] = slot;
4771 257613 : if (level == path->lowest_level) {
4772 0 : ret = 0;
4773 0 : goto out;
4774 : }
4775 257613 : cur = btrfs_read_node_slot(cur, slot);
4776 257614 : if (IS_ERR(cur)) {
4777 0 : ret = PTR_ERR(cur);
4778 0 : goto out;
4779 : }
4780 :
4781 257614 : btrfs_tree_read_lock(cur);
4782 :
4783 257613 : path->locks[level - 1] = BTRFS_READ_LOCK;
4784 257613 : path->nodes[level - 1] = cur;
4785 257613 : unlock_up(path, level, 1, 0, NULL);
4786 : }
4787 0 : out:
4788 248722 : path->keep_locks = keep_locks;
4789 248722 : if (ret == 0) {
4790 156030 : btrfs_unlock_up_safe(path, path->lowest_level + 1);
4791 312058 : memcpy(min_key, &found_key, sizeof(found_key));
4792 : }
4793 248721 : return ret;
4794 : }
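       :
       : /*
       :  * Editor's sketch, not part of the original source: the scan pattern the
       :  * defrag and tree logging users mentioned above roughly follow; how the
       :  * key is advanced between iterations is up to the caller:
       :  *
       :  *	min_key.objectid = 0;
       :  *	min_key.type = 0;
       :  *	min_key.offset = 0;
       :  *	while (1) {
       :  *		ret = btrfs_search_forward(root, &min_key, path, min_trans);
       :  *		if (ret)	// < 0 on error, 1 when nothing newer is left
       :  *			break;
       :  *		// examine path->nodes[path->lowest_level] here
       :  *		btrfs_release_path(path);
       :  *		// advance min_key past the key just returned before looping
       :  *	}
       :  */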
4795 :
4796 : /*
4797 : * this is similar to btrfs_next_leaf, but does not try to preserve
4798 : * and fixup the path. It looks for and returns the next key in the
4799 : * tree based on the current path and the min_trans parameters.
4800 : *
4801 : * 0 is returned if another key is found, < 0 if there are any errors
4802 : * and 1 is returned if there are no higher keys in the tree
4803 : *
4804 : * path->keep_locks should be set to 1 on the search made before
4805 : * calling this function.
4806 : */
4807 102659 : int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4808 : struct btrfs_key *key, int level, u64 min_trans)
4809 : {
4810 102659 : int slot;
4811 102659 : struct extent_buffer *c;
4812 :
4813 102659 : WARN_ON(!path->keep_locks && !path->skip_locking);
4814 203661 : while (level < BTRFS_MAX_LEVEL) {
4815 203661 : if (!path->nodes[level])
4816 : return 1;
4817 :
4818 203661 : slot = path->slots[level] + 1;
4819 203661 : c = path->nodes[level];
4820 : next:
4821 204053 : if (slot >= btrfs_header_nritems(c)) {
4822 193551 : int ret;
4823 193551 : int orig_lowest;
4824 193551 : struct btrfs_key cur_key;
4825 193551 : if (level + 1 >= BTRFS_MAX_LEVEL ||
4826 193551 : !path->nodes[level + 1])
4827 92549 : return 1;
4828 :
4829 101002 : if (path->locks[level + 1] || path->skip_locking) {
4830 101002 : level++;
4831 101002 : continue;
4832 : }
4833 :
4834 0 : slot = btrfs_header_nritems(c) - 1;
4835 0 : if (level == 0)
4836 0 : btrfs_item_key_to_cpu(c, &cur_key, slot);
4837 : else
4838 0 : btrfs_node_key_to_cpu(c, &cur_key, slot);
4839 :
4840 0 : orig_lowest = path->lowest_level;
4841 0 : btrfs_release_path(path);
4842 0 : path->lowest_level = level;
4843 0 : ret = btrfs_search_slot(NULL, root, &cur_key, path,
4844 : 0, 0);
4845 0 : path->lowest_level = orig_lowest;
4846 0 : if (ret < 0)
4847 0 : return ret;
4848 :
4849 0 : c = path->nodes[level];
4850 0 : slot = path->slots[level];
4851 0 : if (ret == 0)
4852 0 : slot++;
4853 0 : goto next;
4854 : }
4855 :
4856 10502 : if (level == 0)
4857 0 : btrfs_item_key_to_cpu(c, key, slot);
4858 : else {
4859 10502 : u64 gen = btrfs_node_ptr_generation(c, slot);
4860 :
4861 10501 : if (gen < min_trans) {
4862 392 : slot++;
4863 392 : goto next;
4864 : }
4865 10109 : btrfs_node_key_to_cpu(c, key, slot);
4866 : }
4867 : return 0;
4868 : }
4869 : return 1;
4870 : }
4871 :
4872 29290899 : int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4873 : u64 time_seq)
4874 : {
4875 29290899 : int slot;
4876 29290899 : int level;
4877 29290899 : struct extent_buffer *c;
4878 29290899 : struct extent_buffer *next;
4879 29290899 : struct btrfs_fs_info *fs_info = root->fs_info;
4880 29290899 : struct btrfs_key key;
4881 29290899 : bool need_commit_sem = false;
4882 29290899 : u32 nritems;
4883 29290899 : int ret;
4884 29290899 : int i;
4885 :
4886 : /*
4887 : * The nowait semantics are used only for write paths, where we don't
4888 : * use the tree mod log and sequence numbers.
4889 : */
4890 29290899 : if (time_seq)
4891 : ASSERT(!path->nowait);
4892 :
4893 29290899 : nritems = btrfs_header_nritems(path->nodes[0]);
4894 29290899 : if (nritems == 0)
4895 : return 1;
4896 :
4897 29287173 : btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4898 : again:
4899 29289546 : level = 1;
4900 29289546 : next = NULL;
4901 29289546 : btrfs_release_path(path);
4902 :
4903 29291096 : path->keep_locks = 1;
4904 :
4905 29291096 : if (time_seq) {
4906 286 : ret = btrfs_search_old_slot(root, &key, path, time_seq);
4907 : } else {
4908 29290810 : if (path->need_commit_sem) {
4909 67428 : path->need_commit_sem = 0;
4910 67428 : need_commit_sem = true;
4911 67428 : if (path->nowait) {
4912 0 : if (!down_read_trylock(&fs_info->commit_root_sem)) {
4913 0 : ret = -EAGAIN;
4914 0 : goto done;
4915 : }
4916 : } else {
4917 67428 : down_read(&fs_info->commit_root_sem);
4918 : }
4919 : }
4920 29290810 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4921 : }
4922 29290420 : path->keep_locks = 0;
4923 :
4924 29290420 : if (ret < 0)
4925 0 : goto done;
4926 :
4927 29290420 : nritems = btrfs_header_nritems(path->nodes[0]);
4928 : /*
4929 : * By releasing the path above we dropped all our locks. A balance
4930 : * could have added more items next to the key that used to be
4931 : * at the very end of the block, so check again here and
4932 : * advance the path if there are now more items available.
4933 : */
4934 29290420 : if (nritems > 0 && path->slots[0] < nritems - 1) {
4935 14874 : if (ret == 0)
4936 14874 : path->slots[0]++;
4937 14874 : ret = 0;
4938 14874 : goto done;
4939 : }
4940 : /*
4941 : * So the above check misses one case:
4942 : * - after releasing the path above, someone has removed the item that
4943 : * used to be at the very end of the block, and balancing between
4944 : * leaves put another item with a bigger key.offset in its place.
4945 : *
4946 : * That item should be returned as well, or we can get leaf corruption
4947 : * later (especially in __btrfs_drop_extents()).
4948 : *
4949 : * A bit more explanation of this check: with ret > 0 the key was
4950 : * not found, and the path points to the slot where it would be
4951 : * inserted, so the item at path->slots[0] must be the bigger
4952 : * one.
4953 : */
4954 29275546 : if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4955 0 : ret = 0;
4956 0 : goto done;
4957 : }
4958 :
4959 59845875 : while (level < BTRFS_MAX_LEVEL) {
4960 59845875 : if (!path->nodes[level]) {
4961 17002271 : ret = 1;
4962 17002271 : goto done;
4963 : }
4964 :
4965 42843604 : slot = path->slots[level] + 1;
4966 42843604 : c = path->nodes[level];
4967 42843604 : if (slot >= btrfs_header_nritems(c)) {
4968 30570329 : level++;
4969 30570329 : if (level == BTRFS_MAX_LEVEL) {
4970 0 : ret = 1;
4971 0 : goto done;
4972 : }
4973 30570329 : continue;
4974 : }
4975 :
4976 :
4977 : /*
4978 : * Our current level is where we're going to start from, and to
4979 : * make sure lockdep doesn't complain we need to drop our locks
4980 : * and nodes from 0 to our current level.
4981 : */
4982 24607019 : for (i = 0; i < level; i++) {
4983 12333136 : if (path->locks[level]) {
4984 12160862 : btrfs_tree_read_unlock(path->nodes[i]);
4985 12160557 : path->locks[i] = 0;
4986 : }
4987 12332831 : free_extent_buffer(path->nodes[i]);
4988 12333744 : path->nodes[i] = NULL;
4989 : }
4990 :
4991 12273883 : next = c;
4992 12273883 : ret = read_block_for_search(root, path, &next, level,
4993 : slot, &key);
4994 12272452 : if (ret == -EAGAIN && !path->nowait)
4995 2705 : goto again;
4996 :
4997 12269747 : if (ret < 0) {
4998 0 : btrfs_release_path(path);
4999 0 : goto done;
5000 : }
5001 :
5002 12269747 : if (!path->skip_locking) {
5003 12097964 : ret = btrfs_try_tree_read_lock(next);
5004 12098365 : if (!ret && path->nowait) {
5005 0 : ret = -EAGAIN;
5006 0 : goto done;
5007 : }
5008 12098365 : if (!ret && time_seq) {
5009 : /*
5010 : * If we don't get the lock, we may be racing
5011 : * with push_leaf_left, holding that lock while
5012 : * itself waiting for the leaf we've currently
5013 : * locked. To solve this situation, we give up
5014 : * on our lock and cycle.
5015 : */
5016 0 : free_extent_buffer(next);
5017 0 : btrfs_release_path(path);
5018 0 : cond_resched();
5019 0 : goto again;
5020 : }
5021 12098365 : if (!ret)
5022 154 : btrfs_tree_read_lock(next);
5023 : }
5024 : break;
5025 : }
5026 12270148 : path->slots[level] = slot;
5027 12329866 : while (1) {
5028 12329866 : level--;
5029 12329866 : path->nodes[level] = next;
5030 12329866 : path->slots[level] = 0;
5031 12329866 : if (!path->skip_locking)
5032 12157723 : path->locks[level] = BTRFS_READ_LOCK;
5033 12329866 : if (!level)
5034 : break;
5035 :
5036 60043 : ret = read_block_for_search(root, path, &next, level,
5037 : 0, &key);
5038 59827 : if (ret == -EAGAIN && !path->nowait)
5039 109 : goto again;
5040 :
5041 59718 : if (ret < 0) {
5042 0 : btrfs_release_path(path);
5043 0 : goto done;
5044 : }
5045 :
5046 59718 : if (!path->skip_locking) {
5047 59522 : if (path->nowait) {
5048 0 : if (!btrfs_try_tree_read_lock(next)) {
5049 0 : ret = -EAGAIN;
5050 0 : goto done;
5051 : }
5052 : } else {
5053 59522 : btrfs_tree_read_lock(next);
5054 : }
5055 : }
5056 : }
5057 : ret = 0;
5058 29286968 : done:
5059 29286968 : unlock_up(path, 0, 1, 0, NULL);
5060 29287105 : if (need_commit_sem) {
5061 67428 : int ret2;
5062 :
5063 67428 : path->need_commit_sem = 1;
5064 67428 : ret2 = finish_need_commit_sem_search(path);
5065 67428 : up_read(&fs_info->commit_root_sem);
5066 67428 : if (ret2)
5067 0 : ret = ret2;
5068 : }
5069 :
5070 : return ret;
5071 : }
5072 :
5073 17475601 : int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
5074 : {
5075 17475601 : path->slots[0]++;
5076 17475601 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
5077 114252 : return btrfs_next_old_leaf(root, path, time_seq);
5078 : return 0;
5079 : }
5080 :
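/*
 * Illustrative sketch (not part of the listing above): the common forward
 * iteration pattern built on top of btrfs_next_old_leaf().  Callers that do
 * not need a tree mod log sequence number normally go through the
 * time_seq == 0 wrappers btrfs_next_leaf()/btrfs_next_item() from the header.
 * The function name and the per-item callback are made up for this example.
 */
static int for_each_item_from_sketch(struct btrfs_root *root,
				     const struct btrfs_key *first,
				     int (*process)(struct extent_buffer *leaf,
						    int slot))
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			/* Equivalent to btrfs_next_leaf(root, path). */
			ret = btrfs_next_old_leaf(root, path, 0);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
		}
		ret = process(path->nodes[0], path->slots[0]);
		if (ret)
			break;
		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}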
5081 : /*
5082 : * This uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5083 : * searching until it gets past min_objectid or finds an item of 'type'.
5084 : *
5085 : * Returns 0 if something is found, 1 if nothing was found and < 0 on error.
5086 : */
5087 213100 : int btrfs_previous_item(struct btrfs_root *root,
5088 : struct btrfs_path *path, u64 min_objectid,
5089 : int type)
5090 : {
5091 213460 : struct btrfs_key found_key;
5092 213460 : struct extent_buffer *leaf;
5093 213460 : u32 nritems;
5094 213460 : int ret;
5095 :
5096 213460 : while (1) {
5097 213460 : if (path->slots[0] == 0) {
5098 911 : ret = btrfs_prev_leaf(root, path);
5099 911 : if (ret != 0)
5100 909 : return ret;
5101 : } else {
5102 212549 : path->slots[0]--;
5103 : }
5104 212551 : leaf = path->nodes[0];
5105 212551 : nritems = btrfs_header_nritems(leaf);
5106 212551 : if (nritems == 0)
5107 : return 1;
5108 212551 : if (path->slots[0] == nritems)
5109 2 : path->slots[0]--;
5110 :
5111 212551 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5112 212550 : if (found_key.objectid < min_objectid)
5113 : break;
5114 211843 : if (found_key.type == type)
5115 : return 0;
5116 1346 : if (found_key.objectid == min_objectid &&
5117 : found_key.type < type)
5118 : break;
5119 : }
5120 : return 1;
5121 : }
5122 :
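/*
 * Illustrative sketch (not part of the listing above): a typical use of
 * btrfs_previous_item() after a search has positioned @path, stepping back
 * to the previous item of @type and checking that it still belongs to
 * @objectid.  The helper name and the objectid check are assumptions for
 * this example; many callers re-check the returned key in a similar way.
 */
static int prev_item_of_type_sketch(struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 objectid, int type,
				    struct btrfs_key *found)
{
	int ret;

	ret = btrfs_previous_item(root, path, objectid, type);
	if (ret)	/* 1: nothing found, < 0: error */
		return ret;

	/* 0 means path->nodes[0]/path->slots[0] now point at an item of @type. */
	btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	if (found->objectid != objectid)
		return 1;	/* the previous item belongs to another object */
	return 0;
}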
5123 : /*
5124 : * Search in the extent tree to find a previous Metadata/Data extent item
5125 : * with min objectid.
5126 : *
5127 : * Returns 0 if something is found, 1 if nothing was found and < 0 on error.
5128 : */
5129 66684 : int btrfs_previous_extent_item(struct btrfs_root *root,
5130 : struct btrfs_path *path, u64 min_objectid)
5131 : {
5132 76643 : struct btrfs_key found_key;
5133 76643 : struct extent_buffer *leaf;
5134 76643 : u32 nritems;
5135 76643 : int ret;
5136 :
5137 76643 : while (1) {
5138 76643 : if (path->slots[0] == 0) {
5139 52 : ret = btrfs_prev_leaf(root, path);
5140 52 : if (ret != 0)
5141 9 : return ret;
5142 : } else {
5143 76591 : path->slots[0]--;
5144 : }
5145 76634 : leaf = path->nodes[0];
5146 76634 : nritems = btrfs_header_nritems(leaf);
5147 76634 : if (nritems == 0)
5148 : return 1;
5149 76634 : if (path->slots[0] == nritems)
5150 43 : path->slots[0]--;
5151 :
5152 76634 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5153 76634 : if (found_key.objectid < min_objectid)
5154 : break;
5155 76634 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5156 : found_key.type == BTRFS_METADATA_ITEM_KEY)
5157 : return 0;
5158 9959 : if (found_key.objectid == min_objectid &&
5159 : found_key.type < BTRFS_EXTENT_ITEM_KEY)
5160 : break;
5161 : }
5162 : return 1;
5163 : }
5164 :
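/*
 * Illustrative sketch (not part of the listing above): how a caller might
 * use btrfs_previous_extent_item() to back up from a search position to the
 * EXTENT_ITEM/METADATA_ITEM that may cover @bytenr.  The helper name and the
 * covering check are made up for this example; for a data extent the key
 * offset is the extent length, for a metadata item it is the tree level.
 */
static int extent_item_before_sketch(struct btrfs_root *extent_root,
				     struct btrfs_path *path,
				     u64 bytenr, struct btrfs_key *found)
{
	int ret;

	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret)	/* 1: no previous extent item, < 0: error */
		return ret;

	btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	/*
	 * A data extent covers @bytenr only if it starts at or before it and
	 * ends after it.
	 */
	if (found->type == BTRFS_EXTENT_ITEM_KEY &&
	    (found->objectid > bytenr ||
	     found->objectid + found->offset <= bytenr))
		return 1;
	return 0;
}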
5165 11 : int __init btrfs_ctree_init(void)
5166 : {
5167 11 : btrfs_path_cachep = kmem_cache_create("btrfs_path",
5168 : sizeof(struct btrfs_path), 0,
5169 : SLAB_MEM_SPREAD, NULL);
5170 11 : if (!btrfs_path_cachep)
5171 0 : return -ENOMEM;
5172 : return 0;
5173 : }
5174 :
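/*
 * Illustrative sketch (not part of the listing above): roughly how the
 * btrfs_path cache created in btrfs_ctree_init() is consumed by the path
 * alloc/free helpers defined earlier in this file.  The names below are made
 * up for the example; the real helpers are btrfs_alloc_path() and
 * btrfs_free_path().
 */
static struct btrfs_path *alloc_path_sketch(void)
{
	/* Zeroed allocation; GFP_NOFS so it cannot recurse into fs reclaim. */
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

static void free_path_sketch(struct btrfs_path *p)
{
	if (!p)
		return;
	/* Drop references and locks on any extent buffers still held. */
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}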
5175 0 : void __cold btrfs_ctree_exit(void)
5176 : {
5177 0 : kmem_cache_destroy(btrfs_path_cachep);
5178 0 : }