Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2007,2008 Oracle. All rights reserved.
4 : */
5 :
6 : #include <linux/sched.h>
7 : #include <linux/slab.h>
8 : #include <linux/rbtree.h>
9 : #include <linux/mm.h>
10 : #include <linux/error-injection.h>
11 : #include "messages.h"
12 : #include "ctree.h"
13 : #include "disk-io.h"
14 : #include "transaction.h"
15 : #include "print-tree.h"
16 : #include "locking.h"
17 : #include "volumes.h"
18 : #include "qgroup.h"
19 : #include "tree-mod-log.h"
20 : #include "tree-checker.h"
21 : #include "fs.h"
22 : #include "accessors.h"
23 : #include "extent-tree.h"
24 : #include "relocation.h"
25 : #include "file-item.h"
26 :
27 : static struct kmem_cache *btrfs_path_cachep;
28 :
29 : static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30 : *root, struct btrfs_path *path, int level);
31 : static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32 : const struct btrfs_key *ins_key, struct btrfs_path *path,
33 : int data_size, int extend);
34 : static int push_node_left(struct btrfs_trans_handle *trans,
35 : struct extent_buffer *dst,
36 : struct extent_buffer *src, int empty);
37 : static int balance_node_right(struct btrfs_trans_handle *trans,
38 : struct extent_buffer *dst_buf,
39 : struct extent_buffer *src_buf);
40 :
41 : static const struct btrfs_csums {
42 : u16 size;
43 : const char name[10];
44 : const char driver[12];
45 : } btrfs_csums[] = {
46 : [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
47 : [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
48 : [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
49 : [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
50 : .driver = "blake2b-256" },
51 : };
52 :
53 : /*
54 : * The leaf data grows from end-to-front in the node. this returns the address
55 : * of the start of the last item, which is the stop of the leaf data stack.
56 : */
57 140422697 : static unsigned int leaf_data_end(const struct extent_buffer *leaf)
58 : {
59 140422697 : u32 nr = btrfs_header_nritems(leaf);
60 :
61 140422697 : if (nr == 0)
62 13591 : return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
63 140409106 : return btrfs_item_offset(leaf, nr - 1);
64 : }
65 :
66 : /*
67 : * Move data in a @leaf (using memmove, safe for overlapping ranges).
68 : *
69 : * @leaf: leaf that we're doing a memmove on
70 : * @dst_offset: item data offset we're moving to
71 : * @src_offset: item data offset were' moving from
72 : * @len: length of the data we're moving
73 : *
74 : * Wrapper around memmove_extent_buffer() that takes into account the header on
75 : * the leaf. The btrfs_item offset's start directly after the header, so we
76 : * have to adjust any offsets to account for the header in the leaf. This
77 : * handles that math to simplify the callers.
78 : */
79 : static inline void memmove_leaf_data(const struct extent_buffer *leaf,
80 : unsigned long dst_offset,
81 : unsigned long src_offset,
82 : unsigned long len)
83 : {
84 106398683 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
85 : btrfs_item_nr_offset(leaf, 0) + src_offset, len);
86 16124291 : }
87 :
88 : /*
89 : * Copy item data from @src into @dst at the given @offset.
90 : *
91 : * @dst: destination leaf that we're copying into
92 : * @src: source leaf that we're copying from
93 : * @dst_offset: item data offset we're copying to
94 : * @src_offset: item data offset were' copying from
95 : * @len: length of the data we're copying
96 : *
97 : * Wrapper around copy_extent_buffer() that takes into account the header on
98 : * the leaf. The btrfs_item offset's start directly after the header, so we
99 : * have to adjust any offsets to account for the header in the leaf. This
100 : * handles that math to simplify the callers.
101 : */
102 : static inline void copy_leaf_data(const struct extent_buffer *dst,
103 : const struct extent_buffer *src,
104 : unsigned long dst_offset,
105 : unsigned long src_offset, unsigned long len)
106 : {
107 2808270 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
108 : btrfs_item_nr_offset(src, 0) + src_offset, len);
109 : }
110 :
111 : /*
112 : * Move items in a @leaf (using memmove).
113 : *
114 : * @dst: destination leaf for the items
115 : * @dst_item: the item nr we're copying into
116 : * @src_item: the item nr we're copying from
117 : * @nr_items: the number of items to copy
118 : *
119 : * Wrapper around memmove_extent_buffer() that does the math to get the
120 : * appropriate offsets into the leaf from the item numbers.
121 : */
122 64241780 : static inline void memmove_leaf_items(const struct extent_buffer *leaf,
123 : int dst_item, int src_item, int nr_items)
124 : {
125 64241780 : memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
126 : btrfs_item_nr_offset(leaf, src_item),
127 : nr_items * sizeof(struct btrfs_item));
128 949735 : }
129 :
130 : /*
131 : * Copy items from @src into @dst at the given @offset.
132 : *
133 : * @dst: destination leaf for the items
134 : * @src: source leaf for the items
135 : * @dst_item: the item nr we're copying into
136 : * @src_item: the item nr we're copying from
137 : * @nr_items: the number of items to copy
138 : *
139 : * Wrapper around copy_extent_buffer() that does the math to get the
140 : * appropriate offsets into the leaf from the item numbers.
141 : */
142 : static inline void copy_leaf_items(const struct extent_buffer *dst,
143 : const struct extent_buffer *src,
144 : int dst_item, int src_item, int nr_items)
145 : {
146 1032341 : copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
147 : btrfs_item_nr_offset(src, src_item),
148 : nr_items * sizeof(struct btrfs_item));
149 : }
150 :
151 : /* This exists for btrfs-progs usages. */
152 0 : u16 btrfs_csum_type_size(u16 type)
153 : {
154 3256 : return btrfs_csums[type].size;
155 : }
156 :
157 3256 : int btrfs_super_csum_size(const struct btrfs_super_block *s)
158 : {
159 3256 : u16 t = btrfs_super_csum_type(s);
160 : /*
161 : * csum type is validated at mount time
162 : */
163 3256 : return btrfs_csum_type_size(t);
164 : }
165 :
166 3244 : const char *btrfs_super_csum_name(u16 csum_type)
167 : {
168 : /* csum type is validated at mount time */
169 3244 : return btrfs_csums[csum_type].name;
170 : }
171 :
172 : /*
173 : * Return driver name if defined, otherwise the name that's also a valid driver
174 : * name
175 : */
176 3244 : const char *btrfs_super_csum_driver(u16 csum_type)
177 : {
178 : /* csum type is validated at mount time */
179 3244 : return btrfs_csums[csum_type].driver[0] ?
180 3244 : btrfs_csums[csum_type].driver :
181 : btrfs_csums[csum_type].name;
182 : }
183 :
184 0 : size_t __attribute_const__ btrfs_get_num_csums(void)
185 : {
186 0 : return ARRAY_SIZE(btrfs_csums);
187 : }
188 :
189 204681517 : struct btrfs_path *btrfs_alloc_path(void)
190 : {
191 204681517 : might_sleep();
192 :
193 204653902 : return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
194 : }
195 :
196 : /* this also releases the path */
197 308664606 : void btrfs_free_path(struct btrfs_path *p)
198 : {
199 308664606 : if (!p)
200 : return;
201 204701005 : btrfs_release_path(p);
202 204710440 : kmem_cache_free(btrfs_path_cachep, p);
203 : }
204 :
205 : /*
206 : * path release drops references on the extent buffers in the path
207 : * and it drops any locks held by this path
208 : *
209 : * It is safe to call this on paths that no locks or extent buffers held.
210 : */
211 584555034 : noinline void btrfs_release_path(struct btrfs_path *p)
212 : {
213 584555034 : int i;
214 :
215 5258666797 : for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
216 4674185235 : p->slots[i] = 0;
217 4674185235 : if (!p->nodes[i])
218 3729206612 : continue;
219 944978623 : if (p->locks[i]) {
220 424391153 : btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
221 424309767 : p->locks[i] = 0;
222 : }
223 944897237 : free_extent_buffer(p->nodes[i]);
224 944905151 : p->nodes[i] = NULL;
225 : }
226 584481562 : }
227 :
228 : /*
229 : * We want the transaction abort to print stack trace only for errors where the
230 : * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
231 : * caused by external factors.
232 : */
233 17 : bool __cold abort_should_print_stack(int errno)
234 : {
235 17 : switch (errno) {
236 : case -EIO:
237 : case -EROFS:
238 : case -ENOMEM:
239 : return false;
240 : }
241 0 : return true;
242 : }
243 :
244 : /*
245 : * safely gets a reference on the root node of a tree. A lock
246 : * is not taken, so a concurrent writer may put a different node
247 : * at the root of the tree. See btrfs_lock_root_node for the
248 : * looping required.
249 : *
250 : * The extent buffer returned by this has a reference taken, so
251 : * it won't disappear. It may stop being the root of the tree
252 : * at any time because there are no locks held.
253 : */
254 501964884 : struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
255 : {
256 501964884 : struct extent_buffer *eb;
257 :
258 501964884 : while (1) {
259 501964884 : rcu_read_lock();
260 502042399 : eb = rcu_dereference(root->node);
261 :
262 : /*
263 : * RCU really hurts here, we could free up the root node because
264 : * it was COWed but we may not get the new root node yet so do
265 : * the inc_not_zero dance and if it doesn't work then
266 : * synchronize_rcu and try again.
267 : */
268 1004273281 : if (atomic_inc_not_zero(&eb->refs)) {
269 502230882 : rcu_read_unlock();
270 502224566 : break;
271 : }
272 0 : rcu_read_unlock();
273 0 : synchronize_rcu();
274 : }
275 502224566 : return eb;
276 : }
277 :
278 : /*
279 : * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
280 : * just get put onto a simple dirty list. Transaction walks this list to make
281 : * sure they get properly updated on disk.
282 : */
283 1338798 : static void add_root_to_dirty_list(struct btrfs_root *root)
284 : {
285 1338798 : struct btrfs_fs_info *fs_info = root->fs_info;
286 :
287 1338798 : if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
288 0 : !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
289 : return;
290 :
291 459192 : spin_lock(&fs_info->trans_lock);
292 459191 : if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
293 : /* Want the extent tree to be the last on the list */
294 459191 : if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
295 206456 : list_move_tail(&root->dirty_list,
296 : &fs_info->dirty_cowonly_roots);
297 : else
298 252735 : list_move(&root->dirty_list,
299 : &fs_info->dirty_cowonly_roots);
300 : }
301 459191 : spin_unlock(&fs_info->trans_lock);
302 : }
303 :
304 : /*
305 : * used by snapshot creation to make a copy of a root for a tree with
306 : * a given objectid. The buffer with the new root node is returned in
307 : * cow_ret, and this func returns zero on success or a negative error code.
308 : */
309 2546 : int btrfs_copy_root(struct btrfs_trans_handle *trans,
310 : struct btrfs_root *root,
311 : struct extent_buffer *buf,
312 : struct extent_buffer **cow_ret, u64 new_root_objectid)
313 : {
314 2546 : struct btrfs_fs_info *fs_info = root->fs_info;
315 2546 : struct extent_buffer *cow;
316 2546 : int ret = 0;
317 2546 : int level;
318 2546 : struct btrfs_disk_key disk_key;
319 :
320 5092 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
321 : trans->transid != fs_info->running_transaction->transid);
322 5092 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
323 : trans->transid != root->last_trans);
324 :
325 2546 : level = btrfs_header_level(buf);
326 2546 : if (level == 0)
327 812 : btrfs_item_key(buf, &disk_key, 0);
328 : else
329 1734 : btrfs_node_key(buf, &disk_key, 0);
330 :
331 2546 : cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
332 : &disk_key, level, buf->start, 0,
333 : BTRFS_NESTING_NEW_ROOT);
334 2546 : if (IS_ERR(cow))
335 0 : return PTR_ERR(cow);
336 :
337 2546 : copy_extent_buffer_full(cow, buf);
338 2546 : btrfs_set_header_bytenr(cow, cow->start);
339 2546 : btrfs_set_header_generation(cow, trans->transid);
340 2546 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
341 2546 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
342 : BTRFS_HEADER_FLAG_RELOC);
343 2546 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
344 1520 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
345 : else
346 1026 : btrfs_set_header_owner(cow, new_root_objectid);
347 :
348 2546 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
349 :
350 2546 : WARN_ON(btrfs_header_generation(buf) > trans->transid);
351 2546 : if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
352 1520 : ret = btrfs_inc_ref(trans, root, cow, 1);
353 : else
354 1026 : ret = btrfs_inc_ref(trans, root, cow, 0);
355 2546 : if (ret) {
356 0 : btrfs_tree_unlock(cow);
357 0 : free_extent_buffer(cow);
358 0 : btrfs_abort_transaction(trans, ret);
359 0 : return ret;
360 : }
361 :
362 2546 : btrfs_mark_buffer_dirty(cow);
363 2546 : *cow_ret = cow;
364 2546 : return 0;
365 : }
366 :
367 : /*
368 : * check if the tree block can be shared by multiple trees
369 : */
370 9160097 : int btrfs_block_can_be_shared(struct btrfs_root *root,
371 : struct extent_buffer *buf)
372 : {
373 : /*
374 : * Tree blocks not in shareable trees and tree roots are never shared.
375 : * If a block was allocated after the last snapshot and the block was
376 : * not allocated by tree relocation, we know the block is not shared.
377 : */
378 18320194 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
379 5012698 : buf != root->node && buf != root->commit_root &&
380 : (btrfs_header_generation(buf) <=
381 2824192 : btrfs_root_last_snapshot(&root->root_item) ||
382 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
383 2019142 : return 1;
384 :
385 : return 0;
386 : }
387 :
388 9063067 : static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
389 : struct btrfs_root *root,
390 : struct extent_buffer *buf,
391 : struct extent_buffer *cow,
392 : int *last_ref)
393 : {
394 9063067 : struct btrfs_fs_info *fs_info = root->fs_info;
395 9063067 : u64 refs;
396 9063067 : u64 owner;
397 9063067 : u64 flags;
398 9063067 : u64 new_flags = 0;
399 9063067 : int ret;
400 :
401 : /*
402 : * Backrefs update rules:
403 : *
404 : * Always use full backrefs for extent pointers in tree block
405 : * allocated by tree relocation.
406 : *
407 : * If a shared tree block is no longer referenced by its owner
408 : * tree (btrfs_header_owner(buf) == root->root_key.objectid),
409 : * use full backrefs for extent pointers in tree block.
410 : *
411 : * If a tree block is been relocating
412 : * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
413 : * use full backrefs for extent pointers in tree block.
414 : * The reason for this is some operations (such as drop tree)
415 : * are only allowed for blocks use full backrefs.
416 : */
417 :
418 9063067 : if (btrfs_block_can_be_shared(root, buf)) {
419 2018979 : ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
420 : btrfs_header_level(buf), 1,
421 : &refs, &flags);
422 2018979 : if (ret)
423 : return ret;
424 2018979 : if (unlikely(refs == 0)) {
425 0 : btrfs_crit(fs_info,
426 : "found 0 references for tree block at bytenr %llu level %d root %llu",
427 : buf->start, btrfs_header_level(buf),
428 : btrfs_root_id(root));
429 0 : ret = -EUCLEAN;
430 0 : btrfs_abort_transaction(trans, ret);
431 0 : return ret;
432 : }
433 : } else {
434 7043951 : refs = 1;
435 7043951 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
436 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
437 615 : flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
438 : else
439 7043336 : flags = 0;
440 : }
441 :
442 9062930 : owner = btrfs_header_owner(buf);
443 9062930 : BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
444 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
445 :
446 9062930 : if (refs > 1) {
447 2017295 : if ((owner == root->root_key.objectid ||
448 2016385 : root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
449 2016385 : !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
450 69362 : ret = btrfs_inc_ref(trans, root, buf, 1);
451 69362 : if (ret)
452 : return ret;
453 :
454 69362 : if (root->root_key.objectid ==
455 : BTRFS_TREE_RELOC_OBJECTID) {
456 32716 : ret = btrfs_dec_ref(trans, root, buf, 0);
457 32716 : if (ret)
458 : return ret;
459 32716 : ret = btrfs_inc_ref(trans, root, cow, 1);
460 32716 : if (ret)
461 : return ret;
462 : }
463 69362 : new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
464 : } else {
465 :
466 1947933 : if (root->root_key.objectid ==
467 : BTRFS_TREE_RELOC_OBJECTID)
468 1947011 : ret = btrfs_inc_ref(trans, root, cow, 1);
469 : else
470 922 : ret = btrfs_inc_ref(trans, root, cow, 0);
471 1947933 : if (ret)
472 0 : return ret;
473 : }
474 69362 : if (new_flags != 0) {
475 69362 : ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
476 69362 : if (ret)
477 0 : return ret;
478 : }
479 : } else {
480 7045635 : if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
481 1283 : if (root->root_key.objectid ==
482 : BTRFS_TREE_RELOC_OBJECTID)
483 960 : ret = btrfs_inc_ref(trans, root, cow, 1);
484 : else
485 323 : ret = btrfs_inc_ref(trans, root, cow, 0);
486 1283 : if (ret)
487 : return ret;
488 1283 : ret = btrfs_dec_ref(trans, root, buf, 1);
489 1283 : if (ret)
490 : return ret;
491 : }
492 7045635 : btrfs_clear_buffer_dirty(trans, buf);
493 7045653 : *last_ref = 1;
494 : }
495 : return 0;
496 : }
497 :
498 : /*
499 : * does the dirty work in cow of a single block. The parent block (if
500 : * supplied) is updated to point to the new cow copy. The new buffer is marked
501 : * dirty and returned locked. If you modify the block it needs to be marked
502 : * dirty again.
503 : *
504 : * search_start -- an allocation hint for the new block
505 : *
506 : * empty_size -- a hint that you plan on doing more cow. This is the size in
507 : * bytes the allocator should try to find free next to the block it returns.
508 : * This is just a hint and may be ignored by the allocator.
509 : */
510 9064350 : static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
511 : struct btrfs_root *root,
512 : struct extent_buffer *buf,
513 : struct extent_buffer *parent, int parent_slot,
514 : struct extent_buffer **cow_ret,
515 : u64 search_start, u64 empty_size,
516 : enum btrfs_lock_nesting nest)
517 : {
518 9064350 : struct btrfs_fs_info *fs_info = root->fs_info;
519 9064350 : struct btrfs_disk_key disk_key;
520 9064350 : struct extent_buffer *cow;
521 9064350 : int level, ret;
522 9064350 : int last_ref = 0;
523 9064350 : int unlock_orig = 0;
524 9064350 : u64 parent_start = 0;
525 :
526 9064350 : if (*cow_ret == buf)
527 9064308 : unlock_orig = 1;
528 :
529 9064350 : btrfs_assert_tree_write_locked(buf);
530 :
531 14077522 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
532 : trans->transid != fs_info->running_transaction->transid);
533 14077497 : WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
534 : trans->transid != root->last_trans);
535 :
536 9064350 : level = btrfs_header_level(buf);
537 :
538 9064350 : if (level == 0)
539 8185095 : btrfs_item_key(buf, &disk_key, 0);
540 : else
541 879255 : btrfs_node_key(buf, &disk_key, 0);
542 :
543 9064349 : if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
544 1980013 : parent_start = parent->start;
545 :
546 9064349 : cow = btrfs_alloc_tree_block(trans, root, parent_start,
547 : root->root_key.objectid, &disk_key, level,
548 : search_start, empty_size, nest);
549 9064627 : if (IS_ERR(cow))
550 1372 : return PTR_ERR(cow);
551 :
552 : /* cow is set to blocking by btrfs_init_new_buffer */
553 :
554 9063255 : copy_extent_buffer_full(cow, buf);
555 9063134 : btrfs_set_header_bytenr(cow, cow->start);
556 9063134 : btrfs_set_header_generation(cow, trans->transid);
557 9063134 : btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
558 9063062 : btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
559 : BTRFS_HEADER_FLAG_RELOC);
560 9063125 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
561 1980687 : btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
562 : else
563 7082438 : btrfs_set_header_owner(cow, root->root_key.objectid);
564 :
565 9063125 : write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
566 :
567 9063086 : ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
568 9062887 : if (ret) {
569 0 : btrfs_tree_unlock(cow);
570 0 : free_extent_buffer(cow);
571 0 : btrfs_abort_transaction(trans, ret);
572 0 : return ret;
573 : }
574 :
575 18125774 : if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
576 5011593 : ret = btrfs_reloc_cow_block(trans, root, buf, cow);
577 5011560 : if (ret) {
578 0 : btrfs_tree_unlock(cow);
579 0 : free_extent_buffer(cow);
580 0 : btrfs_abort_transaction(trans, ret);
581 0 : return ret;
582 : }
583 : }
584 :
585 9062854 : if (buf == root->node) {
586 1335577 : WARN_ON(parent && parent != buf);
587 1335577 : if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
588 : btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
589 674 : parent_start = buf->start;
590 :
591 1335577 : ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
592 1335552 : if (ret < 0) {
593 0 : btrfs_tree_unlock(cow);
594 0 : free_extent_buffer(cow);
595 0 : btrfs_abort_transaction(trans, ret);
596 0 : return ret;
597 : }
598 1335552 : atomic_inc(&cow->refs);
599 1335582 : rcu_assign_pointer(root->node, cow);
600 :
601 1335581 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
602 : parent_start, last_ref);
603 1335593 : free_extent_buffer(buf);
604 1335596 : add_root_to_dirty_list(root);
605 : } else {
606 7727277 : WARN_ON(trans->transid != btrfs_header_generation(parent));
607 7727277 : ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
608 : BTRFS_MOD_LOG_KEY_REPLACE);
609 7727184 : if (ret) {
610 0 : btrfs_tree_unlock(cow);
611 0 : free_extent_buffer(cow);
612 0 : btrfs_abort_transaction(trans, ret);
613 0 : return ret;
614 : }
615 7727184 : btrfs_set_node_blockptr(parent, parent_slot,
616 : cow->start);
617 7726955 : btrfs_set_node_ptr_generation(parent, parent_slot,
618 : trans->transid);
619 7727060 : btrfs_mark_buffer_dirty(parent);
620 7727349 : if (last_ref) {
621 5710054 : ret = btrfs_tree_mod_log_free_eb(buf);
622 5709547 : if (ret) {
623 0 : btrfs_tree_unlock(cow);
624 0 : free_extent_buffer(cow);
625 0 : btrfs_abort_transaction(trans, ret);
626 0 : return ret;
627 : }
628 : }
629 7726842 : btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
630 : parent_start, last_ref);
631 : }
632 9063252 : if (unlock_orig)
633 9063246 : btrfs_tree_unlock(buf);
634 9063247 : free_extent_buffer_stale(buf);
635 9062997 : btrfs_mark_buffer_dirty(cow);
636 9063263 : *cow_ret = cow;
637 9063263 : return 0;
638 : }
639 :
640 504982123 : static inline int should_cow_block(struct btrfs_trans_handle *trans,
641 : struct btrfs_root *root,
642 : struct extent_buffer *buf)
643 : {
644 504982123 : if (btrfs_is_testing(root->fs_info))
645 : return 0;
646 :
647 : /* Ensure we can see the FORCE_COW bit */
648 504982123 : smp_mb__before_atomic();
649 :
650 : /*
651 : * We do not need to cow a block if
652 : * 1) this block is not created or changed in this transaction;
653 : * 2) this block does not belong to TREE_RELOC tree;
654 : * 3) the root is not forced COW.
655 : *
656 : * What is forced COW:
657 : * when we create snapshot during committing the transaction,
658 : * after we've finished copying src root, we must COW the shared
659 : * block to ensure the metadata consistency.
660 : */
661 504982123 : if (btrfs_header_generation(buf) == trans->transid &&
662 488110801 : !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
663 488110801 : !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
664 488106686 : btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
665 488106686 : !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
666 488078924 : return 0;
667 : return 1;
668 : }
669 :
670 : /*
671 : * cows a single block, see __btrfs_cow_block for the real work.
672 : * This version of it has extra checks so that a block isn't COWed more than
673 : * once per transaction, as long as it hasn't been written yet
674 : */
675 13060875 : noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
676 : struct btrfs_root *root, struct extent_buffer *buf,
677 : struct extent_buffer *parent, int parent_slot,
678 : struct extent_buffer **cow_ret,
679 : enum btrfs_lock_nesting nest)
680 : {
681 13060875 : struct btrfs_fs_info *fs_info = root->fs_info;
682 13060875 : u64 search_start;
683 13060875 : int ret;
684 :
685 26121750 : if (test_bit(BTRFS_ROOT_DELETING, &root->state))
686 0 : btrfs_err(fs_info,
687 : "COW'ing blocks on a fs root that's being dropped");
688 :
689 13060875 : if (trans->transaction != fs_info->running_transaction)
690 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
691 : trans->transid,
692 : fs_info->running_transaction->transid);
693 :
694 13060829 : if (trans->transid != fs_info->generation)
695 0 : WARN(1, KERN_CRIT "trans %llu running %llu\n",
696 : trans->transid, fs_info->generation);
697 :
698 13060829 : if (!should_cow_block(trans, root, buf)) {
699 3996443 : *cow_ret = buf;
700 3996443 : return 0;
701 : }
702 :
703 9064376 : search_start = buf->start & ~((u64)SZ_1G - 1);
704 :
705 : /*
706 : * Before CoWing this block for later modification, check if it's
707 : * the subtree root and do the delayed subtree trace if needed.
708 : *
709 : * Also We don't care about the error, as it's handled internally.
710 : */
711 9064376 : btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
712 9064341 : ret = __btrfs_cow_block(trans, root, buf, parent,
713 : parent_slot, cow_ret, search_start, 0, nest);
714 :
715 9064633 : trace_btrfs_cow_block(root, buf, *cow_ret);
716 :
717 9064633 : return ret;
718 : }
719 : ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
720 :
721 : /*
722 : * helper function for defrag to decide if two blocks pointed to by a
723 : * node are actually close by
724 : */
725 0 : static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
726 : {
727 0 : if (blocknr < other && other - (blocknr + blocksize) < 32768)
728 : return 1;
729 0 : if (blocknr > other && blocknr - (other + blocksize) < 32768)
730 0 : return 1;
731 : return 0;
732 : }
733 :
734 : #ifdef __LITTLE_ENDIAN
735 :
736 : /*
737 : * Compare two keys, on little-endian the disk order is same as CPU order and
738 : * we can avoid the conversion.
739 : */
740 : static int comp_keys(const struct btrfs_disk_key *disk_key,
741 : const struct btrfs_key *k2)
742 : {
743 5282246262 : const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
744 :
745 5282246262 : return btrfs_comp_cpu_keys(k1, k2);
746 : }
747 :
748 : #else
749 :
750 : /*
751 : * compare two keys in a memcmp fashion
752 : */
753 : static int comp_keys(const struct btrfs_disk_key *disk,
754 : const struct btrfs_key *k2)
755 : {
756 : struct btrfs_key k1;
757 :
758 : btrfs_disk_key_to_cpu(&k1, disk);
759 :
760 : return btrfs_comp_cpu_keys(&k1, k2);
761 : }
762 : #endif
763 :
764 : /*
765 : * same as comp_keys only with two btrfs_key's
766 : */
767 6888865498 : int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
768 : {
769 6888865498 : if (k1->objectid > k2->objectid)
770 : return 1;
771 5595884208 : if (k1->objectid < k2->objectid)
772 : return -1;
773 3508241859 : if (k1->type > k2->type)
774 : return 1;
775 3404127251 : if (k1->type < k2->type)
776 : return -1;
777 3232002439 : if (k1->offset > k2->offset)
778 : return 1;
779 2600184942 : if (k1->offset < k2->offset)
780 2058641225 : return -1;
781 : return 0;
782 : }
783 :
784 : /*
785 : * this is used by the defrag code to go through all the
786 : * leaves pointed to by a node and reallocate them so that
787 : * disk order is close to key order
788 : */
789 0 : int btrfs_realloc_node(struct btrfs_trans_handle *trans,
790 : struct btrfs_root *root, struct extent_buffer *parent,
791 : int start_slot, u64 *last_ret,
792 : struct btrfs_key *progress)
793 : {
794 0 : struct btrfs_fs_info *fs_info = root->fs_info;
795 0 : struct extent_buffer *cur;
796 0 : u64 blocknr;
797 0 : u64 search_start = *last_ret;
798 0 : u64 last_block = 0;
799 0 : u64 other;
800 0 : u32 parent_nritems;
801 0 : int end_slot;
802 0 : int i;
803 0 : int err = 0;
804 0 : u32 blocksize;
805 0 : int progress_passed = 0;
806 0 : struct btrfs_disk_key disk_key;
807 :
808 0 : WARN_ON(trans->transaction != fs_info->running_transaction);
809 0 : WARN_ON(trans->transid != fs_info->generation);
810 :
811 0 : parent_nritems = btrfs_header_nritems(parent);
812 0 : blocksize = fs_info->nodesize;
813 0 : end_slot = parent_nritems - 1;
814 :
815 0 : if (parent_nritems <= 1)
816 : return 0;
817 :
818 0 : for (i = start_slot; i <= end_slot; i++) {
819 0 : int close = 1;
820 :
821 0 : btrfs_node_key(parent, &disk_key, i);
822 0 : if (!progress_passed && comp_keys(&disk_key, progress) < 0)
823 0 : continue;
824 :
825 0 : progress_passed = 1;
826 0 : blocknr = btrfs_node_blockptr(parent, i);
827 0 : if (last_block == 0)
828 0 : last_block = blocknr;
829 :
830 0 : if (i > 0) {
831 0 : other = btrfs_node_blockptr(parent, i - 1);
832 0 : close = close_blocks(blocknr, other, blocksize);
833 : }
834 0 : if (!close && i < end_slot) {
835 0 : other = btrfs_node_blockptr(parent, i + 1);
836 0 : close = close_blocks(blocknr, other, blocksize);
837 : }
838 0 : if (close) {
839 0 : last_block = blocknr;
840 0 : continue;
841 : }
842 :
843 0 : cur = btrfs_read_node_slot(parent, i);
844 0 : if (IS_ERR(cur))
845 0 : return PTR_ERR(cur);
846 0 : if (search_start == 0)
847 0 : search_start = last_block;
848 :
849 0 : btrfs_tree_lock(cur);
850 0 : err = __btrfs_cow_block(trans, root, cur, parent, i,
851 : &cur, search_start,
852 0 : min(16 * blocksize,
853 : (end_slot - i) * blocksize),
854 : BTRFS_NESTING_COW);
855 0 : if (err) {
856 0 : btrfs_tree_unlock(cur);
857 0 : free_extent_buffer(cur);
858 0 : break;
859 : }
860 0 : search_start = cur->start;
861 0 : last_block = cur->start;
862 0 : *last_ret = search_start;
863 0 : btrfs_tree_unlock(cur);
864 0 : free_extent_buffer(cur);
865 : }
866 : return err;
867 : }
868 :
869 : /*
870 : * Search for a key in the given extent_buffer.
871 : *
872 : * The lower boundary for the search is specified by the slot number @first_slot.
873 : * Use a value of 0 to search over the whole extent buffer. Works for both
874 : * leaves and nodes.
875 : *
876 : * The slot in the extent buffer is returned via @slot. If the key exists in the
877 : * extent buffer, then @slot will point to the slot where the key is, otherwise
878 : * it points to the slot where you would insert the key.
879 : *
880 : * Slot may point to the total number of items (i.e. one position beyond the last
881 : * key) if the key is bigger than the last key in the extent buffer.
882 : */
883 898202327 : int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
884 : const struct btrfs_key *key, int *slot)
885 : {
886 898202327 : unsigned long p;
887 898202327 : int item_size;
888 : /*
889 : * Use unsigned types for the low and high slots, so that we get a more
890 : * efficient division in the search loop below.
891 : */
892 898202327 : u32 low = first_slot;
893 898202327 : u32 high = btrfs_header_nritems(eb);
894 898202327 : int ret;
895 898202327 : const int key_size = sizeof(struct btrfs_disk_key);
896 :
897 898202327 : if (unlikely(low > high)) {
898 0 : btrfs_err(eb->fs_info,
899 : "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
900 : __func__, low, high, eb->start,
901 : btrfs_header_owner(eb), btrfs_header_level(eb));
902 0 : return -EINVAL;
903 : }
904 :
905 898202327 : if (btrfs_header_level(eb) == 0) {
906 : p = offsetof(struct btrfs_leaf, items);
907 : item_size = sizeof(struct btrfs_item);
908 : } else {
909 560796395 : p = offsetof(struct btrfs_node, ptrs);
910 560796395 : item_size = sizeof(struct btrfs_key_ptr);
911 : }
912 :
913 5894291683 : while (low < high) {
914 5192094577 : unsigned long oip;
915 5192094577 : unsigned long offset;
916 5192094577 : struct btrfs_disk_key *tmp;
917 5192094577 : struct btrfs_disk_key unaligned;
918 5192094577 : int mid;
919 :
920 5192094577 : mid = (low + high) / 2;
921 5192094577 : offset = p + mid * item_size;
922 5192094577 : oip = offset_in_page(offset);
923 :
924 5192094577 : if (oip + key_size <= PAGE_SIZE) {
925 5182352040 : const unsigned long idx = get_eb_page_index(offset);
926 5182352040 : char *kaddr = page_address(eb->pages[idx]);
927 :
928 5182352040 : oip = get_eb_offset_in_page(eb, offset);
929 5182352040 : tmp = (struct btrfs_disk_key *)(kaddr + oip);
930 : } else {
931 9742537 : read_extent_buffer(eb, &unaligned, offset, key_size);
932 9742537 : tmp = &unaligned;
933 : }
934 :
935 5192296353 : ret = comp_keys(tmp, key);
936 :
937 5192296353 : if (ret < 0)
938 3019114729 : low = mid + 1;
939 2173181624 : else if (ret > 0)
940 : high = mid;
941 : else {
942 196206997 : *slot = mid;
943 196206997 : return 0;
944 : }
945 : }
946 702197106 : *slot = low;
947 702197106 : return 1;
948 : }
949 :
950 375158 : static void root_add_used(struct btrfs_root *root, u32 size)
951 : {
952 375158 : spin_lock(&root->accounting_lock);
953 375158 : btrfs_set_root_used(&root->root_item,
954 : btrfs_root_used(&root->root_item) + size);
955 375158 : spin_unlock(&root->accounting_lock);
956 375158 : }
957 :
958 119511 : static void root_sub_used(struct btrfs_root *root, u32 size)
959 : {
960 119511 : spin_lock(&root->accounting_lock);
961 119511 : btrfs_set_root_used(&root->root_item,
962 : btrfs_root_used(&root->root_item) - size);
963 119511 : spin_unlock(&root->accounting_lock);
964 119511 : }
965 :
966 : /* given a node and slot number, this reads the blocks it points to. The
967 : * extent buffer is returned with a reference taken (but unlocked).
968 : */
969 9047960 : struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
970 : int slot)
971 : {
972 9047960 : int level = btrfs_header_level(parent);
973 9047960 : struct btrfs_tree_parent_check check = { 0 };
974 9047960 : struct extent_buffer *eb;
975 :
976 9047960 : if (slot < 0 || slot >= btrfs_header_nritems(parent))
977 : return ERR_PTR(-ENOENT);
978 :
979 9047941 : ASSERT(level);
980 :
981 9047941 : check.level = level - 1;
982 9047941 : check.transid = btrfs_node_ptr_generation(parent, slot);
983 9047995 : check.owner_root = btrfs_header_owner(parent);
984 9047995 : check.has_first_key = true;
985 9047995 : btrfs_node_key_to_cpu(parent, &check.first_key, slot);
986 :
987 9048020 : eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
988 : &check);
989 9048049 : if (IS_ERR(eb))
990 : return eb;
991 18096098 : if (!extent_buffer_uptodate(eb)) {
992 0 : free_extent_buffer(eb);
993 0 : return ERR_PTR(-EIO);
994 : }
995 :
996 : return eb;
997 : }
998 :
999 : /*
1000 : * node level balancing, used to make sure nodes are in proper order for
1001 : * item deletion. We balance from the top down, so we have to make sure
1002 : * that a deletion won't leave an node completely empty later on.
1003 : */
1004 59530036 : static noinline int balance_level(struct btrfs_trans_handle *trans,
1005 : struct btrfs_root *root,
1006 : struct btrfs_path *path, int level)
1007 : {
1008 59530036 : struct btrfs_fs_info *fs_info = root->fs_info;
1009 59530036 : struct extent_buffer *right = NULL;
1010 59530036 : struct extent_buffer *mid;
1011 59530036 : struct extent_buffer *left = NULL;
1012 59530036 : struct extent_buffer *parent = NULL;
1013 59530036 : int ret = 0;
1014 59530036 : int wret;
1015 59530036 : int pslot;
1016 59530036 : int orig_slot = path->slots[level];
1017 59530036 : u64 orig_ptr;
1018 :
1019 59530036 : ASSERT(level > 0);
1020 :
1021 59530036 : mid = path->nodes[level];
1022 :
1023 59530036 : WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
1024 59530036 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1025 :
1026 59530036 : orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1027 :
1028 59530149 : if (level < BTRFS_MAX_LEVEL - 1) {
1029 59530286 : parent = path->nodes[level + 1];
1030 59530286 : pslot = path->slots[level + 1];
1031 : }
1032 :
1033 : /*
1034 : * deal with the case where there is only one pointer in the root
1035 : * by promoting the node below to a root
1036 : */
1037 59530286 : if (!parent) {
1038 49410449 : struct extent_buffer *child;
1039 :
1040 49410449 : if (btrfs_header_nritems(mid) != 1)
1041 49410449 : return 0;
1042 :
1043 : /* promote the child to a root */
1044 1188 : child = btrfs_read_node_slot(mid, 0);
1045 1188 : if (IS_ERR(child)) {
1046 0 : ret = PTR_ERR(child);
1047 0 : goto out;
1048 : }
1049 :
1050 1188 : btrfs_tree_lock(child);
1051 1188 : ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1052 : BTRFS_NESTING_COW);
1053 1188 : if (ret) {
1054 0 : btrfs_tree_unlock(child);
1055 0 : free_extent_buffer(child);
1056 0 : goto out;
1057 : }
1058 :
1059 1188 : ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
1060 1188 : if (ret < 0) {
1061 0 : btrfs_tree_unlock(child);
1062 0 : free_extent_buffer(child);
1063 0 : btrfs_abort_transaction(trans, ret);
1064 0 : goto out;
1065 : }
1066 1188 : rcu_assign_pointer(root->node, child);
1067 :
1068 1188 : add_root_to_dirty_list(root);
1069 1188 : btrfs_tree_unlock(child);
1070 :
1071 1188 : path->locks[level] = 0;
1072 1188 : path->nodes[level] = NULL;
1073 1188 : btrfs_clear_buffer_dirty(trans, mid);
1074 1188 : btrfs_tree_unlock(mid);
1075 : /* once for the path */
1076 1188 : free_extent_buffer(mid);
1077 :
1078 1188 : root_sub_used(root, mid->len);
1079 1188 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1080 : /* once for the root ptr */
1081 1188 : free_extent_buffer_stale(mid);
1082 1188 : return 0;
1083 : }
1084 10119700 : if (btrfs_header_nritems(mid) >
1085 10119700 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1086 : return 0;
1087 :
1088 4354 : if (pslot) {
1089 4327 : left = btrfs_read_node_slot(parent, pslot - 1);
1090 4327 : if (IS_ERR(left)) {
1091 0 : ret = PTR_ERR(left);
1092 0 : left = NULL;
1093 0 : goto out;
1094 : }
1095 :
1096 4327 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1097 4327 : wret = btrfs_cow_block(trans, root, left,
1098 : parent, pslot - 1, &left,
1099 : BTRFS_NESTING_LEFT_COW);
1100 4327 : if (wret) {
1101 0 : ret = wret;
1102 0 : goto out;
1103 : }
1104 : }
1105 :
1106 4354 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1107 115 : right = btrfs_read_node_slot(parent, pslot + 1);
1108 115 : if (IS_ERR(right)) {
1109 0 : ret = PTR_ERR(right);
1110 0 : right = NULL;
1111 0 : goto out;
1112 : }
1113 :
1114 115 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1115 115 : wret = btrfs_cow_block(trans, root, right,
1116 : parent, pslot + 1, &right,
1117 : BTRFS_NESTING_RIGHT_COW);
1118 115 : if (wret) {
1119 0 : ret = wret;
1120 0 : goto out;
1121 : }
1122 : }
1123 :
1124 : /* first, try to make some room in the middle buffer */
1125 4354 : if (left) {
1126 4327 : orig_slot += btrfs_header_nritems(left);
1127 4327 : wret = push_node_left(trans, left, mid, 1);
1128 4327 : if (wret < 0)
1129 : ret = wret;
1130 : }
1131 :
1132 : /*
1133 : * then try to empty the right most buffer into the middle
1134 : */
1135 4354 : if (right) {
1136 115 : wret = push_node_left(trans, mid, right, 1);
1137 115 : if (wret < 0 && wret != -ENOSPC)
1138 : ret = wret;
1139 115 : if (btrfs_header_nritems(right) == 0) {
1140 94 : btrfs_clear_buffer_dirty(trans, right);
1141 94 : btrfs_tree_unlock(right);
1142 94 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
1143 94 : if (ret < 0) {
1144 0 : free_extent_buffer_stale(right);
1145 0 : right = NULL;
1146 0 : goto out;
1147 : }
1148 94 : root_sub_used(root, right->len);
1149 94 : btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1150 : 0, 1);
1151 94 : free_extent_buffer_stale(right);
1152 94 : right = NULL;
1153 : } else {
1154 21 : struct btrfs_disk_key right_key;
1155 21 : btrfs_node_key(right, &right_key, 0);
1156 21 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1157 : BTRFS_MOD_LOG_KEY_REPLACE);
1158 21 : if (ret < 0) {
1159 0 : btrfs_abort_transaction(trans, ret);
1160 0 : goto out;
1161 : }
1162 21 : btrfs_set_node_key(parent, &right_key, pslot + 1);
1163 21 : btrfs_mark_buffer_dirty(parent);
1164 : }
1165 : }
1166 4354 : if (btrfs_header_nritems(mid) == 1) {
1167 : /*
1168 : * we're not allowed to leave a node with one item in the
1169 : * tree during a delete. A deletion from lower in the tree
1170 : * could try to delete the only pointer in this node.
1171 : * So, pull some keys from the left.
1172 : * There has to be a left pointer at this point because
1173 : * otherwise we would have pulled some pointers from the
1174 : * right
1175 : */
1176 0 : if (unlikely(!left)) {
1177 0 : btrfs_crit(fs_info,
1178 : "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
1179 : parent->start, btrfs_header_level(parent),
1180 : mid->start, btrfs_root_id(root));
1181 0 : ret = -EUCLEAN;
1182 0 : btrfs_abort_transaction(trans, ret);
1183 0 : goto out;
1184 : }
1185 0 : wret = balance_node_right(trans, mid, left);
1186 0 : if (wret < 0) {
1187 0 : ret = wret;
1188 0 : goto out;
1189 : }
1190 0 : if (wret == 1) {
1191 0 : wret = push_node_left(trans, left, mid, 1);
1192 0 : if (wret < 0)
1193 : ret = wret;
1194 : }
1195 0 : BUG_ON(wret == 1);
1196 : }
1197 4354 : if (btrfs_header_nritems(mid) == 0) {
1198 18 : btrfs_clear_buffer_dirty(trans, mid);
1199 18 : btrfs_tree_unlock(mid);
1200 18 : ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
1201 18 : if (ret < 0) {
1202 0 : free_extent_buffer_stale(mid);
1203 0 : mid = NULL;
1204 0 : goto out;
1205 : }
1206 18 : root_sub_used(root, mid->len);
1207 18 : btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1208 18 : free_extent_buffer_stale(mid);
1209 18 : mid = NULL;
1210 : } else {
1211 : /* update the parent key to reflect our changes */
1212 4336 : struct btrfs_disk_key mid_key;
1213 4336 : btrfs_node_key(mid, &mid_key, 0);
1214 4336 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1215 : BTRFS_MOD_LOG_KEY_REPLACE);
1216 4336 : if (ret < 0) {
1217 0 : btrfs_abort_transaction(trans, ret);
1218 0 : goto out;
1219 : }
1220 4336 : btrfs_set_node_key(parent, &mid_key, pslot);
1221 4336 : btrfs_mark_buffer_dirty(parent);
1222 : }
1223 :
1224 : /* update the path */
1225 4354 : if (left) {
1226 4327 : if (btrfs_header_nritems(left) > orig_slot) {
1227 91 : atomic_inc(&left->refs);
1228 : /* left was locked after cow */
1229 91 : path->nodes[level] = left;
1230 91 : path->slots[level + 1] -= 1;
1231 91 : path->slots[level] = orig_slot;
1232 91 : if (mid) {
1233 73 : btrfs_tree_unlock(mid);
1234 73 : free_extent_buffer(mid);
1235 : }
1236 : } else {
1237 4236 : orig_slot -= btrfs_header_nritems(left);
1238 4236 : path->slots[level] = orig_slot;
1239 : }
1240 : }
1241 : /* double check we haven't messed things up */
1242 4354 : if (orig_ptr !=
1243 4354 : btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1244 0 : BUG();
1245 4354 : out:
1246 4354 : if (right) {
1247 21 : btrfs_tree_unlock(right);
1248 21 : free_extent_buffer(right);
1249 : }
1250 4354 : if (left) {
1251 4327 : if (path->nodes[level] != left)
1252 4236 : btrfs_tree_unlock(left);
1253 4327 : free_extent_buffer(left);
1254 : }
1255 : return ret;
1256 : }
1257 :
1258 : /* Node balancing for insertion. Here we only split or push nodes around
1259 : * when they are completely full. This is also done top down, so we
1260 : * have to be pessimistic.
1261 : */
1262 116996 : static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1263 : struct btrfs_root *root,
1264 : struct btrfs_path *path, int level)
1265 : {
1266 116996 : struct btrfs_fs_info *fs_info = root->fs_info;
1267 116996 : struct extent_buffer *right = NULL;
1268 116996 : struct extent_buffer *mid;
1269 116996 : struct extent_buffer *left = NULL;
1270 116996 : struct extent_buffer *parent = NULL;
1271 116996 : int ret = 0;
1272 116996 : int wret;
1273 116996 : int pslot;
1274 116996 : int orig_slot = path->slots[level];
1275 :
1276 116996 : if (level == 0)
1277 : return 1;
1278 :
1279 116996 : mid = path->nodes[level];
1280 116996 : WARN_ON(btrfs_header_generation(mid) != trans->transid);
1281 :
1282 116996 : if (level < BTRFS_MAX_LEVEL - 1) {
1283 116996 : parent = path->nodes[level + 1];
1284 116996 : pslot = path->slots[level + 1];
1285 : }
1286 :
1287 116996 : if (!parent)
1288 : return 1;
1289 :
1290 : /* first, try to make some room in the middle buffer */
1291 116996 : if (pslot) {
1292 69670 : u32 left_nr;
1293 :
1294 69670 : left = btrfs_read_node_slot(parent, pslot - 1);
1295 69670 : if (IS_ERR(left))
1296 0 : return PTR_ERR(left);
1297 :
1298 69670 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1299 :
1300 69670 : left_nr = btrfs_header_nritems(left);
1301 69670 : if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1302 : wret = 1;
1303 : } else {
1304 58883 : ret = btrfs_cow_block(trans, root, left, parent,
1305 : pslot - 1, &left,
1306 : BTRFS_NESTING_LEFT_COW);
1307 58883 : if (ret)
1308 : wret = 1;
1309 : else {
1310 58883 : wret = push_node_left(trans, left, mid, 0);
1311 : }
1312 : }
1313 58883 : if (wret < 0)
1314 : ret = wret;
1315 58883 : if (wret == 0) {
1316 58883 : struct btrfs_disk_key disk_key;
1317 58883 : orig_slot += left_nr;
1318 58883 : btrfs_node_key(mid, &disk_key, 0);
1319 58883 : ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1320 : BTRFS_MOD_LOG_KEY_REPLACE);
1321 58883 : if (ret < 0) {
1322 0 : btrfs_tree_unlock(left);
1323 0 : free_extent_buffer(left);
1324 0 : btrfs_abort_transaction(trans, ret);
1325 0 : return ret;
1326 : }
1327 58883 : btrfs_set_node_key(parent, &disk_key, pslot);
1328 58883 : btrfs_mark_buffer_dirty(parent);
1329 58883 : if (btrfs_header_nritems(left) > orig_slot) {
1330 279 : path->nodes[level] = left;
1331 279 : path->slots[level + 1] -= 1;
1332 279 : path->slots[level] = orig_slot;
1333 279 : btrfs_tree_unlock(mid);
1334 279 : free_extent_buffer(mid);
1335 : } else {
1336 58604 : orig_slot -=
1337 : btrfs_header_nritems(left);
1338 58604 : path->slots[level] = orig_slot;
1339 58604 : btrfs_tree_unlock(left);
1340 58604 : free_extent_buffer(left);
1341 : }
1342 58883 : return 0;
1343 : }
1344 10787 : btrfs_tree_unlock(left);
1345 10787 : free_extent_buffer(left);
1346 : }
1347 :
1348 : /*
1349 : * then try to empty the right most buffer into the middle
1350 : */
1351 58113 : if (pslot + 1 < btrfs_header_nritems(parent)) {
1352 57833 : u32 right_nr;
1353 :
1354 57833 : right = btrfs_read_node_slot(parent, pslot + 1);
1355 57833 : if (IS_ERR(right))
1356 0 : return PTR_ERR(right);
1357 :
1358 57833 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1359 :
1360 57833 : right_nr = btrfs_header_nritems(right);
1361 57833 : if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1362 : wret = 1;
1363 : } else {
1364 57776 : ret = btrfs_cow_block(trans, root, right,
1365 : parent, pslot + 1,
1366 : &right, BTRFS_NESTING_RIGHT_COW);
1367 57776 : if (ret)
1368 : wret = 1;
1369 : else {
1370 57776 : wret = balance_node_right(trans, right, mid);
1371 : }
1372 : }
1373 57776 : if (wret < 0)
1374 : ret = wret;
1375 57776 : if (wret == 0) {
1376 57776 : struct btrfs_disk_key disk_key;
1377 :
1378 57776 : btrfs_node_key(right, &disk_key, 0);
1379 57776 : ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1380 : BTRFS_MOD_LOG_KEY_REPLACE);
1381 57776 : if (ret < 0) {
1382 0 : btrfs_tree_unlock(right);
1383 0 : free_extent_buffer(right);
1384 0 : btrfs_abort_transaction(trans, ret);
1385 0 : return ret;
1386 : }
1387 57776 : btrfs_set_node_key(parent, &disk_key, pslot + 1);
1388 57776 : btrfs_mark_buffer_dirty(parent);
1389 :
1390 57776 : if (btrfs_header_nritems(mid) <= orig_slot) {
1391 100 : path->nodes[level] = right;
1392 100 : path->slots[level + 1] += 1;
1393 100 : path->slots[level] = orig_slot -
1394 : btrfs_header_nritems(mid);
1395 100 : btrfs_tree_unlock(mid);
1396 100 : free_extent_buffer(mid);
1397 : } else {
1398 57676 : btrfs_tree_unlock(right);
1399 57676 : free_extent_buffer(right);
1400 : }
1401 57776 : return 0;
1402 : }
1403 57 : btrfs_tree_unlock(right);
1404 57 : free_extent_buffer(right);
1405 : }
1406 : return 1;
1407 : }
1408 :
1409 : /*
1410 : * readahead one full node of leaves, finding things that are close
1411 : * to the block in 'slot', and triggering ra on them.
1412 : */
1413 32481 : static void reada_for_search(struct btrfs_fs_info *fs_info,
1414 : struct btrfs_path *path,
1415 : int level, int slot, u64 objectid)
1416 : {
1417 32481 : struct extent_buffer *node;
1418 32481 : struct btrfs_disk_key disk_key;
1419 32481 : u32 nritems;
1420 32481 : u64 search;
1421 32481 : u64 target;
1422 32481 : u64 nread = 0;
1423 32481 : u64 nread_max;
1424 32481 : u32 nr;
1425 32481 : u32 blocksize;
1426 32481 : u32 nscan = 0;
1427 :
1428 32481 : if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1429 541 : return;
1430 :
1431 31941 : if (!path->nodes[level])
1432 : return;
1433 :
1434 31941 : node = path->nodes[level];
1435 :
1436 : /*
1437 : * Since the time between visiting leaves is much shorter than the time
1438 : * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1439 : * much IO at once (possibly random).
1440 : */
1441 31941 : if (path->reada == READA_FORWARD_ALWAYS) {
1442 19847 : if (level > 1)
1443 6669 : nread_max = node->fs_info->nodesize;
1444 : else
1445 : nread_max = SZ_128K;
1446 : } else {
1447 : nread_max = SZ_64K;
1448 : }
1449 :
1450 31941 : search = btrfs_node_blockptr(node, slot);
1451 31940 : blocksize = fs_info->nodesize;
1452 31940 : if (path->reada != READA_FORWARD_ALWAYS) {
1453 12094 : struct extent_buffer *eb;
1454 :
1455 12094 : eb = find_extent_buffer(fs_info, search);
1456 12093 : if (eb) {
1457 1 : free_extent_buffer(eb);
1458 1 : return;
1459 : }
1460 : }
1461 :
1462 31938 : target = search;
1463 :
1464 31938 : nritems = btrfs_header_nritems(node);
1465 31938 : nr = slot;
1466 :
1467 485944 : while (1) {
1468 485944 : if (path->reada == READA_BACK) {
1469 488 : if (nr == 0)
1470 : break;
1471 488 : nr--;
1472 485456 : } else if (path->reada == READA_FORWARD ||
1473 : path->reada == READA_FORWARD_ALWAYS) {
1474 485427 : nr++;
1475 485427 : if (nr >= nritems)
1476 : break;
1477 : }
1478 480158 : if (path->reada == READA_BACK && objectid) {
1479 488 : btrfs_node_key(node, &disk_key, nr);
1480 488 : if (btrfs_disk_key_objectid(&disk_key) != objectid)
1481 : break;
1482 : }
1483 479917 : search = btrfs_node_blockptr(node, nr);
1484 479922 : if (path->reada == READA_FORWARD_ALWAYS ||
1485 356562 : (search <= target && target - search <= 65536) ||
1486 206120 : (search > target && search - target <= 65536)) {
1487 134141 : btrfs_readahead_node_child(node, nr);
1488 134139 : nread += blocksize;
1489 : }
1490 479920 : nscan++;
1491 479920 : if (nread > nread_max || nscan > 32)
1492 : break;
1493 : }
1494 : }
1495 :
1496 59647249 : static noinline void reada_for_balance(struct btrfs_path *path, int level)
1497 : {
1498 59647249 : struct extent_buffer *parent;
1499 59647249 : int slot;
1500 59647249 : int nritems;
1501 :
1502 59647249 : parent = path->nodes[level + 1];
1503 59647249 : if (!parent)
1504 : return;
1505 :
1506 10236696 : nritems = btrfs_header_nritems(parent);
1507 10236696 : slot = path->slots[level + 1];
1508 :
1509 10236696 : if (slot > 0)
1510 9814623 : btrfs_readahead_node_child(parent, slot - 1);
1511 10236696 : if (slot + 1 < nritems)
1512 3485022 : btrfs_readahead_node_child(parent, slot + 1);
1513 : }
1514 :
1515 :
1516 : /*
1517 : * when we walk down the tree, it is usually safe to unlock the higher layers
1518 : * in the tree. The exceptions are when our path goes through slot 0, because
1519 : * operations on the tree might require changing key pointers higher up in the
1520 : * tree.
1521 : *
1522 : * callers might also have set path->keep_locks, which tells this code to keep
1523 : * the lock if the path points to the last slot in the block. This is part of
1524 : * walking through the tree, and selecting the next slot in the higher block.
1525 : *
1526 : * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1527 : * if lowest_unlock is 1, level 0 won't be unlocked
1528 : */
1529 915992980 : static noinline void unlock_up(struct btrfs_path *path, int level,
1530 : int lowest_unlock, int min_write_lock_level,
1531 : int *write_lock_level)
1532 : {
1533 915992980 : int i;
1534 915992980 : int skip_level = level;
1535 915992980 : bool check_skip = true;
1536 :
1537 2055407723 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1538 2055407723 : if (!path->nodes[i])
1539 : break;
1540 1515919367 : if (!path->locks[i])
1541 : break;
1542 :
1543 1138338335 : if (check_skip) {
1544 1109626380 : if (path->slots[i] == 0) {
1545 147039907 : skip_level = i + 1;
1546 147039907 : continue;
1547 : }
1548 :
1549 962586473 : if (path->keep_locks) {
1550 231743407 : u32 nritems;
1551 :
1552 231743407 : nritems = btrfs_header_nritems(path->nodes[i]);
1553 231743407 : if (nritems < 1 || path->slots[i] >= nritems - 1) {
1554 126938597 : skip_level = i + 1;
1555 126938597 : continue;
1556 : }
1557 : }
1558 : }
1559 :
1560 864359831 : if (i >= lowest_unlock && i > skip_level) {
1561 136420658 : check_skip = false;
1562 136420658 : btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1563 137497066 : path->locks[i] = 0;
1564 137497066 : if (write_lock_level &&
1565 137497066 : i > min_write_lock_level &&
1566 40803672 : i <= *write_lock_level) {
1567 154016 : *write_lock_level = i - 1;
1568 : }
1569 : }
1570 : }
1571 917069388 : }
1572 :
1573 : /*
1574 : * Helper function for btrfs_search_slot() and other functions that do a search
1575 : * on a btree. The goal is to find a tree block in the cache (the radix tree at
1576 : * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1577 : * its pages from disk.
1578 : *
1579 : * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1580 : * whole btree search, starting again from the current root node.
1581 : */
1582 : static int
1583 537924568 : read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1584 : struct extent_buffer **eb_ret, int level, int slot,
1585 : const struct btrfs_key *key)
1586 : {
1587 537924568 : struct btrfs_fs_info *fs_info = root->fs_info;
1588 537924568 : struct btrfs_tree_parent_check check = { 0 };
1589 537924568 : u64 blocknr;
1590 537924568 : u64 gen;
1591 537924568 : struct extent_buffer *tmp;
1592 537924568 : int ret;
1593 537924568 : int parent_level;
1594 537924568 : bool unlock_up;
1595 :
1596 537924568 : unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1597 537924568 : blocknr = btrfs_node_blockptr(*eb_ret, slot);
1598 537817072 : gen = btrfs_node_ptr_generation(*eb_ret, slot);
1599 537793186 : parent_level = btrfs_header_level(*eb_ret);
1600 537793186 : btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1601 537782526 : check.has_first_key = true;
1602 537782526 : check.level = parent_level - 1;
1603 537782526 : check.transid = gen;
1604 537782526 : check.owner_root = root->root_key.objectid;
1605 :
1606 : /*
1607 : * If we need to read an extent buffer from disk and we are holding locks
1608 : * on upper level nodes, we unlock all the upper nodes before reading the
1609 : * extent buffer, and then return -EAGAIN to the caller as it needs to
1610 : * restart the search. We don't release the lock on the current level
1611 : * because we need to walk this node to figure out which blocks to read.
1612 : */
1613 537782526 : tmp = find_extent_buffer(fs_info, blocknr);
1614 538139185 : if (tmp) {
1615 537999862 : if (p->reada == READA_FORWARD_ALWAYS)
1616 19836 : reada_for_search(fs_info, p, level, slot, key->objectid);
1617 :
1618 : /* first we do an atomic uptodate check */
1619 537999862 : if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1620 : /*
1621 : * Do extra check for first_key, eb can be stale due to
1622 : * being cached, read from scrub, or have multiple
1623 : * parents (shared tree blocks).
1624 : */
1625 537961048 : if (btrfs_verify_level_key(tmp,
1626 : parent_level - 1, &check.first_key, gen)) {
1627 0 : free_extent_buffer(tmp);
1628 0 : return -EUCLEAN;
1629 : }
1630 537860802 : *eb_ret = tmp;
1631 537860802 : return 0;
1632 : }
1633 :
1634 252 : if (p->nowait) {
1635 0 : free_extent_buffer(tmp);
1636 0 : return -EAGAIN;
1637 : }
1638 :
1639 252 : if (unlock_up)
1640 4 : btrfs_unlock_up_safe(p, level + 1);
1641 :
1642 : /* now we're allowed to do a blocking uptodate check */
1643 252 : ret = btrfs_read_extent_buffer(tmp, &check);
1644 252 : if (ret) {
1645 0 : free_extent_buffer(tmp);
1646 0 : btrfs_release_path(p);
1647 0 : return -EIO;
1648 : }
1649 252 : if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1650 0 : free_extent_buffer(tmp);
1651 0 : btrfs_release_path(p);
1652 0 : return -EUCLEAN;
1653 : }
1654 :
1655 251 : if (unlock_up)
1656 4 : ret = -EAGAIN;
1657 :
1658 247 : goto out;
1659 139323 : } else if (p->nowait) {
1660 : return -EAGAIN;
1661 : }
1662 :
1663 139323 : if (unlock_up) {
1664 12942 : btrfs_unlock_up_safe(p, level + 1);
1665 12942 : ret = -EAGAIN;
1666 : } else {
1667 : ret = 0;
1668 : }
1669 :
1670 139324 : if (p->reada != READA_NONE)
1671 12647 : reada_for_search(fs_info, p, level, slot, key->objectid);
1672 :
1673 139324 : tmp = read_tree_block(fs_info, blocknr, &check);
1674 139322 : if (IS_ERR(tmp)) {
1675 0 : btrfs_release_path(p);
1676 0 : return PTR_ERR(tmp);
1677 : }
1678 : /*
1679 : * If the read above didn't mark this buffer up to date,
1680 : * it will never end up being up to date. Set ret to EIO now
1681 : * and give up so that our caller doesn't loop forever
1682 : * on our EAGAINs.
1683 : */
1684 278644 : if (!extent_buffer_uptodate(tmp))
1685 : ret = -EIO;
1686 :
1687 139322 : out:
1688 139573 : if (ret == 0) {
1689 126626 : *eb_ret = tmp;
1690 : } else {
1691 12947 : free_extent_buffer(tmp);
1692 12947 : btrfs_release_path(p);
1693 : }
1694 :
1695 : return ret;
1696 : }
1697 :
1698 : /*
1699 : * helper function for btrfs_search_slot. This does all of the checks
1700 : * for node-level blocks and does any balancing required based on
1701 : * the ins_len.
1702 : *
1703 : * If no extra work was required, zero is returned. If we had to
1704 : * drop the path, -EAGAIN is returned and btrfs_search_slot must
1705 : * start over
1706 : */
1707 : static int
1708 572209898 : setup_nodes_for_search(struct btrfs_trans_handle *trans,
1709 : struct btrfs_root *root, struct btrfs_path *p,
1710 : struct extent_buffer *b, int level, int ins_len,
1711 : int *write_lock_level)
1712 : {
1713 572209898 : struct btrfs_fs_info *fs_info = root->fs_info;
1714 572209898 : int ret = 0;
1715 :
1716 572209898 : if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1717 156287779 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1718 :
1719 188344 : if (*write_lock_level < level + 1) {
1720 71273 : *write_lock_level = level + 1;
1721 71273 : btrfs_release_path(p);
1722 71273 : return -EAGAIN;
1723 : }
1724 :
1725 117071 : reada_for_balance(p, level);
1726 117071 : ret = split_node(trans, root, p, level);
1727 :
1728 117071 : b = p->nodes[level];
1729 572021554 : } else if (ins_len < 0 && btrfs_header_nritems(b) <
1730 112954991 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1731 :
1732 90496470 : if (*write_lock_level < level + 1) {
1733 30966454 : *write_lock_level = level + 1;
1734 30966454 : btrfs_release_path(p);
1735 30966454 : return -EAGAIN;
1736 : }
1737 :
1738 59530016 : reada_for_balance(p, level);
1739 59529904 : ret = balance_level(trans, root, p, level);
1740 59529577 : if (ret)
1741 : return ret;
1742 :
1743 59529577 : b = p->nodes[level];
1744 59529577 : if (!b) {
1745 1188 : btrfs_release_path(p);
1746 1188 : return -EAGAIN;
1747 : }
1748 59528389 : BUG_ON(btrfs_header_nritems(b) == 1);
1749 : }
1750 : return ret;
1751 : }
1752 :
1753 27914 : int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1754 : u64 iobjectid, u64 ioff, u8 key_type,
1755 : struct btrfs_key *found_key)
1756 : {
1757 27914 : int ret;
1758 27914 : struct btrfs_key key;
1759 27914 : struct extent_buffer *eb;
1760 :
1761 27914 : ASSERT(path);
1762 27914 : ASSERT(found_key);
1763 :
1764 27914 : key.type = key_type;
1765 27914 : key.objectid = iobjectid;
1766 27914 : key.offset = ioff;
1767 :
1768 27914 : ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1769 27914 : if (ret < 0)
1770 : return ret;
1771 :
1772 27914 : eb = path->nodes[0];
1773 27914 : if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1774 845 : ret = btrfs_next_leaf(fs_root, path);
1775 845 : if (ret)
1776 : return ret;
1777 845 : eb = path->nodes[0];
1778 : }
1779 :
1780 27914 : btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1781 27914 : if (found_key->type != key.type ||
1782 27135 : found_key->objectid != key.objectid)
1783 779 : return 1;
1784 :
1785 : return 0;
1786 : }
1787 :
1788 400252291 : static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1789 : struct btrfs_path *p,
1790 : int write_lock_level)
1791 : {
1792 400252291 : struct extent_buffer *b;
1793 400252291 : int root_lock = 0;
1794 400252291 : int level = 0;
1795 :
1796 400252291 : if (p->search_commit_root) {
1797 33614701 : b = root->commit_root;
1798 33614701 : atomic_inc(&b->refs);
1799 33618124 : level = btrfs_header_level(b);
1800 : /*
1801 : * Ensure that all callers have set skip_locking when
1802 : * p->search_commit_root = 1.
1803 : */
1804 33618124 : ASSERT(p->skip_locking == 1);
1805 :
1806 33618124 : goto out;
1807 : }
1808 :
1809 366637590 : if (p->skip_locking) {
1810 7226573 : b = btrfs_root_node(root);
1811 7226573 : level = btrfs_header_level(b);
1812 7226573 : goto out;
1813 : }
1814 :
1815 : /* We try very hard to do read locks on the root */
1816 359411017 : root_lock = BTRFS_READ_LOCK;
1817 :
1818 : /*
1819 : * If the level is set to maximum, we can skip trying to get the read
1820 : * lock.
1821 : */
1822 359411017 : if (write_lock_level < BTRFS_MAX_LEVEL) {
1823 : /*
1824 : * We don't know the level of the root node until we actually
1825 : * have it read locked
1826 : */
1827 326909128 : if (p->nowait) {
1828 0 : b = btrfs_try_read_lock_root_node(root);
1829 0 : if (IS_ERR(b))
1830 : return b;
1831 : } else {
1832 326909128 : b = btrfs_read_lock_root_node(root);
1833 : }
1834 326952982 : level = btrfs_header_level(b);
1835 326952982 : if (level > write_lock_level)
1836 193401532 : goto out;
1837 :
1838 : /* Whoops, must trade for write lock */
1839 133551450 : btrfs_tree_read_unlock(b);
1840 133550000 : free_extent_buffer(b);
1841 : }
1842 :
1843 166068502 : b = btrfs_lock_root_node(root);
1844 166068309 : root_lock = BTRFS_WRITE_LOCK;
1845 :
1846 : /* The level might have changed, check again */
1847 166068309 : level = btrfs_header_level(b);
1848 :
1849 400314538 : out:
1850 : /*
1851 : * The root may have failed to write out at some point, and thus is no
1852 : * longer valid, return an error in this case.
1853 : */
1854 800629076 : if (!extent_buffer_uptodate(b)) {
1855 16 : if (root_lock)
1856 16 : btrfs_tree_unlock_rw(b, root_lock);
1857 16 : free_extent_buffer(b);
1858 16 : return ERR_PTR(-EIO);
1859 : }
1860 :
1861 400314522 : p->nodes[level] = b;
1862 400314522 : if (!p->skip_locking)
1863 359378484 : p->locks[level] = root_lock;
1864 : /*
1865 : * Callers are responsible for dropping b's references.
1866 : */
1867 : return b;
1868 : }
1869 :
1870 : /*
1871 : * Replace the extent buffer at the lowest level of the path with a cloned
1872 : * version. The purpose is to be able to use it safely, after releasing the
1873 : * commit root semaphore, even if relocation is happening in parallel, the
1874 : * transaction used for relocation is committed and the extent buffer is
1875 : * reallocated in the next transaction.
1876 : *
1877 : * This is used in a context where the caller does not prevent transaction
1878 : * commits from happening, either by holding a transaction handle or holding
1879 : * some lock, while it's doing searches through a commit root.
1880 : * At the moment it's only used for send operations.
1881 : */
1882 18820509 : static int finish_need_commit_sem_search(struct btrfs_path *path)
1883 : {
1884 18820509 : const int i = path->lowest_level;
1885 18820509 : const int slot = path->slots[i];
1886 18820509 : struct extent_buffer *lowest = path->nodes[i];
1887 18820509 : struct extent_buffer *clone;
1888 :
1889 18820509 : ASSERT(path->need_commit_sem);
1890 :
1891 18820509 : if (!lowest)
1892 : return 0;
1893 :
1894 18820509 : lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1895 :
1896 18820509 : clone = btrfs_clone_extent_buffer(lowest);
1897 18821712 : if (!clone)
1898 : return -ENOMEM;
1899 :
1900 18821712 : btrfs_release_path(path);
1901 18821191 : path->nodes[i] = clone;
1902 18821191 : path->slots[i] = slot;
1903 :
1904 18821191 : return 0;
1905 : }
1906 :
1907 : static inline int search_for_key_slot(struct extent_buffer *eb,
1908 : int search_low_slot,
1909 : const struct btrfs_key *key,
1910 : int prev_cmp,
1911 : int *slot)
1912 : {
1913 : /*
1914 : * If a previous call to btrfs_bin_search() on a parent node returned an
1915 : * exact match (prev_cmp == 0), we can safely assume the target key will
1916 : * always be at slot 0 on lower levels, since each key pointer
1917 : * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1918 : * subtree it points to. Thus we can skip searching lower levels.
1919 : */
1920 919087551 : if (prev_cmp == 0) {
1921 21260987 : *slot = 0;
1922 21260987 : return 0;
1923 : }
1924 :
1925 897826564 : return btrfs_bin_search(eb, search_low_slot, key, slot);
1926 : }
1927 :
1928 354177655 : static int search_leaf(struct btrfs_trans_handle *trans,
1929 : struct btrfs_root *root,
1930 : const struct btrfs_key *key,
1931 : struct btrfs_path *path,
1932 : int ins_len,
1933 : int prev_cmp)
1934 : {
1935 354177655 : struct extent_buffer *leaf = path->nodes[0];
1936 354177655 : int leaf_free_space = -1;
1937 354177655 : int search_low_slot = 0;
1938 354177655 : int ret;
1939 354177655 : bool do_bin_search = true;
1940 :
1941 : /*
1942 : * If we are doing an insertion, the leaf has enough free space and the
1943 : * destination slot for the key is not slot 0, then we can unlock our
1944 : * write lock on the parent, and any other upper nodes, before doing the
1945 : * binary search on the leaf (with search_for_key_slot()), allowing other
1946 : * tasks to lock the parent and any other upper nodes.
1947 : */
1948 354177655 : if (ins_len > 0) {
1949 : /*
1950 : * Cache the leaf free space, since we will need it later and it
1951 : * will not change until then.
1952 : */
1953 91738804 : leaf_free_space = btrfs_leaf_free_space(leaf);
1954 :
1955 : /*
1956 : * !path->locks[1] means we have a single node tree, the leaf is
1957 : * the root of the tree.
1958 : */
1959 91733859 : if (path->locks[1] && leaf_free_space >= ins_len) {
1960 80269494 : struct btrfs_disk_key first_key;
1961 :
1962 80269494 : ASSERT(btrfs_header_nritems(leaf) > 0);
1963 80269494 : btrfs_item_key(leaf, &first_key, 0);
1964 :
1965 : /*
1966 : * Doing the extra comparison with the first key is cheap,
1967 : * taking into account that the first key is very likely
1968 : * already in a cache line because it immediately follows
1969 : * the extent buffer's header and we have recently accessed
1970 : * the header's level field.
1971 : */
1972 80275982 : ret = comp_keys(&first_key, key);
1973 80275982 : if (ret < 0) {
1974 : /*
1975 : * The first key is smaller than the key we want
1976 : * to insert, so we are safe to unlock all upper
1977 : * nodes and we have to do the binary search.
1978 : *
1979 : * We do use btrfs_unlock_up_safe() and not
1980 : * unlock_up() because the later does not unlock
1981 : * nodes with a slot of 0 - we can safely unlock
1982 : * any node even if its slot is 0 since in this
1983 : * case the key does not end up at slot 0 of the
1984 : * leaf and there's no need to split the leaf.
1985 : */
1986 73058874 : btrfs_unlock_up_safe(path, 1);
1987 73058874 : search_low_slot = 1;
1988 : } else {
1989 : /*
1990 : * The first key is >= then the key we want to
1991 : * insert, so we can skip the binary search as
1992 : * the target key will be at slot 0.
1993 : *
1994 : * We can not unlock upper nodes when the key is
1995 : * less than the first key, because we will need
1996 : * to update the key at slot 0 of the parent node
1997 : * and possibly of other upper nodes too.
1998 : * If the key matches the first key, then we can
1999 : * unlock all the upper nodes, using
2000 : * btrfs_unlock_up_safe() instead of unlock_up()
2001 : * as stated above.
2002 : */
2003 7217108 : if (ret == 0)
2004 7113370 : btrfs_unlock_up_safe(path, 1);
2005 : /*
2006 : * ret is already 0 or 1, matching the result of
2007 : * a btrfs_bin_search() call, so there is no need
2008 : * to adjust it.
2009 : */
2010 7217108 : do_bin_search = false;
2011 7217108 : path->slots[0] = 0;
2012 : }
2013 : }
2014 : }
2015 :
2016 354179991 : if (do_bin_search) {
2017 346811932 : ret = search_for_key_slot(leaf, search_low_slot, key,
2018 : prev_cmp, &path->slots[0]);
2019 346695630 : if (ret < 0)
2020 : return ret;
2021 : }
2022 :
2023 354063689 : if (ins_len > 0) {
2024 : /*
2025 : * Item key already exists. In this case, if we are allowed to
2026 : * insert the item (for example, in dir_item case, item key
2027 : * collision is allowed), it will be merged with the original
2028 : * item. Only the item size grows, no new btrfs item will be
2029 : * added. If search_for_extension is not set, ins_len already
2030 : * accounts the size btrfs_item, deduct it here so leaf space
2031 : * check will be correct.
2032 : */
2033 91722693 : if (ret == 0 && !path->search_for_extension) {
2034 405705 : ASSERT(ins_len >= sizeof(struct btrfs_item));
2035 405705 : ins_len -= sizeof(struct btrfs_item);
2036 : }
2037 :
2038 91722693 : ASSERT(leaf_free_space >= 0);
2039 :
2040 91722693 : if (leaf_free_space < ins_len) {
2041 3635882 : int err;
2042 :
2043 3635882 : err = split_leaf(trans, root, key, path, ins_len,
2044 : (ret == 0));
2045 3635906 : ASSERT(err <= 0);
2046 3635906 : if (WARN_ON(err > 0))
2047 : err = -EUCLEAN;
2048 3635906 : if (err)
2049 : ret = err;
2050 : }
2051 : }
2052 :
2053 : return ret;
2054 : }
2055 :
2056 : /*
2057 : * btrfs_search_slot - look for a key in a tree and perform necessary
2058 : * modifications to preserve tree invariants.
2059 : *
2060 : * @trans: Handle of transaction, used when modifying the tree
2061 : * @p: Holds all btree nodes along the search path
2062 : * @root: The root node of the tree
2063 : * @key: The key we are looking for
2064 : * @ins_len: Indicates purpose of search:
2065 : * >0 for inserts it's size of item inserted (*)
2066 : * <0 for deletions
2067 : * 0 for plain searches, not modifying the tree
2068 : *
2069 : * (*) If size of item inserted doesn't include
2070 : * sizeof(struct btrfs_item), then p->search_for_extension must
2071 : * be set.
2072 : * @cow: boolean should CoW operations be performed. Must always be 1
2073 : * when modifying the tree.
2074 : *
2075 : * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2076 : * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2077 : *
2078 : * If @key is found, 0 is returned and you can find the item in the leaf level
2079 : * of the path (level 0)
2080 : *
2081 : * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2082 : * points to the slot where it should be inserted
2083 : *
2084 : * If an error is encountered while searching the tree a negative error number
2085 : * is returned
2086 : */
2087 362520958 : int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2088 : const struct btrfs_key *key, struct btrfs_path *p,
2089 : int ins_len, int cow)
2090 : {
2091 362520958 : struct btrfs_fs_info *fs_info = root->fs_info;
2092 362520958 : struct extent_buffer *b;
2093 362520958 : int slot;
2094 362520958 : int ret;
2095 362520958 : int err;
2096 362520958 : int level;
2097 362520958 : int lowest_unlock = 1;
2098 : /* everything at write_lock_level or lower must be write locked */
2099 362520958 : int write_lock_level = 0;
2100 362520958 : u8 lowest_level = 0;
2101 362520958 : int min_write_lock_level;
2102 362520958 : int prev_cmp;
2103 :
2104 362520958 : might_sleep();
2105 :
2106 362511531 : lowest_level = p->lowest_level;
2107 362511531 : WARN_ON(lowest_level && ins_len > 0);
2108 362511531 : WARN_ON(p->nodes[0] != NULL);
2109 362511531 : BUG_ON(!cow && ins_len);
2110 :
2111 : /*
2112 : * For now only allow nowait for read only operations. There's no
2113 : * strict reason why we can't, we just only need it for reads so it's
2114 : * only implemented for reads.
2115 : */
2116 362511531 : ASSERT(!p->nowait || !cow);
2117 :
2118 362511531 : if (ins_len < 0) {
2119 61851903 : lowest_unlock = 2;
2120 :
2121 : /* when we are removing items, we might have to go up to level
2122 : * two as we update tree pointers Make sure we keep write
2123 : * for those levels as well
2124 : */
2125 61851903 : write_lock_level = 2;
2126 300659628 : } else if (ins_len > 0) {
2127 : /*
2128 : * for inserting items, make sure we have a write lock on
2129 : * level 1 so we can update keys
2130 : */
2131 91740894 : write_lock_level = 1;
2132 : }
2133 :
2134 362511531 : if (!cow)
2135 173066765 : write_lock_level = -1;
2136 :
2137 362511531 : if (cow && (p->keep_locks || p->lowest_level))
2138 32499269 : write_lock_level = BTRFS_MAX_LEVEL;
2139 :
2140 362511531 : min_write_lock_level = write_lock_level;
2141 :
2142 362511531 : if (p->need_commit_sem) {
2143 18748489 : ASSERT(p->search_commit_root);
2144 18748489 : if (p->nowait) {
2145 0 : if (!down_read_trylock(&fs_info->commit_root_sem))
2146 : return -EAGAIN;
2147 : } else {
2148 18748489 : down_read(&fs_info->commit_root_sem);
2149 : }
2150 : }
2151 :
2152 343763042 : again:
2153 400243080 : prev_cmp = -1;
2154 400243080 : b = btrfs_search_slot_get_root(root, p, write_lock_level);
2155 400173019 : if (IS_ERR(b)) {
2156 16 : ret = PTR_ERR(b);
2157 16 : goto done;
2158 : }
2159 :
2160 927579945 : while (b) {
2161 927579945 : int dec = 0;
2162 :
2163 927579945 : level = btrfs_header_level(b);
2164 :
2165 927579945 : if (cow) {
2166 491994689 : bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2167 :
2168 : /*
2169 : * if we don't really need to cow this block
2170 : * then we don't want to set the path blocking,
2171 : * so we test it here
2172 : */
2173 491994689 : if (!should_cow_block(trans, root, b))
2174 484050316 : goto cow_done;
2175 :
2176 : /*
2177 : * must have write locks on this node and the
2178 : * parent
2179 : */
2180 7880505 : if (level > write_lock_level ||
2181 7639019 : (level + 1 > write_lock_level &&
2182 1625057 : level + 1 < BTRFS_MAX_LEVEL &&
2183 1625057 : p->nodes[level + 1])) {
2184 1185801 : write_lock_level = level + 1;
2185 1185801 : btrfs_release_path(p);
2186 1185888 : goto again;
2187 : }
2188 :
2189 6694704 : if (last_level)
2190 0 : err = btrfs_cow_block(trans, root, b, NULL, 0,
2191 : &b,
2192 : BTRFS_NESTING_COW);
2193 : else
2194 6694704 : err = btrfs_cow_block(trans, root, b,
2195 : p->nodes[level + 1],
2196 : p->slots[level + 1], &b,
2197 : BTRFS_NESTING_COW);
2198 6694840 : if (err) {
2199 77 : ret = err;
2200 77 : goto done;
2201 : }
2202 : }
2203 442280019 : cow_done:
2204 926330335 : p->nodes[level] = b;
2205 :
2206 : /*
2207 : * we have a lock on b and as long as we aren't changing
2208 : * the tree, there is no way to for the items in b to change.
2209 : * It is safe to drop the lock on our parent before we
2210 : * go through the expensive btree search on b.
2211 : *
2212 : * If we're inserting or deleting (ins_len != 0), then we might
2213 : * be changing slot zero, which may require changing the parent.
2214 : * So, we can't drop the lock until after we know which slot
2215 : * we're operating on.
2216 : */
2217 926330335 : if (!ins_len && !p->keep_locks) {
2218 427668607 : int u = level + 1;
2219 :
2220 427668607 : if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2221 185124220 : btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2222 185102814 : p->locks[u] = 0;
2223 : }
2224 : }
2225 :
2226 926308929 : if (level == 0) {
2227 354033310 : if (ins_len > 0)
2228 : ASSERT(write_lock_level >= 1);
2229 :
2230 354033310 : ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2231 353858703 : if (!p->search_for_split)
2232 353837024 : unlock_up(p, level, lowest_unlock,
2233 : min_write_lock_level, NULL);
2234 353882148 : goto done;
2235 : }
2236 :
2237 572275619 : ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2238 572105912 : if (ret < 0)
2239 0 : goto done;
2240 572105912 : prev_cmp = ret;
2241 :
2242 572105912 : if (ret && slot > 0) {
2243 534948145 : dec = 1;
2244 534948145 : slot--;
2245 : }
2246 572105912 : p->slots[level] = slot;
2247 572105912 : err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2248 : &write_lock_level);
2249 572360259 : if (err == -EAGAIN)
2250 31038911 : goto again;
2251 541321348 : if (err) {
2252 0 : ret = err;
2253 0 : goto done;
2254 : }
2255 541321348 : b = p->nodes[level];
2256 541321348 : slot = p->slots[level];
2257 :
2258 : /*
2259 : * Slot 0 is special, if we change the key we have to update
2260 : * the parent pointer which means we must have a write lock on
2261 : * the parent
2262 : */
2263 541321348 : if (slot == 0 && ins_len && write_lock_level < level + 1) {
2264 5497165 : write_lock_level = level + 1;
2265 5497165 : btrfs_release_path(p);
2266 5497260 : goto again;
2267 : }
2268 :
2269 535824183 : unlock_up(p, level, lowest_unlock, min_write_lock_level,
2270 : &write_lock_level);
2271 :
2272 535600528 : if (level == lowest_level) {
2273 8505569 : if (dec)
2274 28123 : p->slots[level]++;
2275 8505569 : goto done;
2276 : }
2277 :
2278 527094959 : err = read_block_for_search(root, p, &b, level, slot, key);
2279 527377048 : if (err == -EAGAIN)
2280 8252 : goto again;
2281 527368796 : if (err) {
2282 0 : ret = err;
2283 0 : goto done;
2284 : }
2285 :
2286 527368796 : if (!p->skip_locking) {
2287 467875229 : level = btrfs_header_level(b);
2288 :
2289 467875229 : btrfs_maybe_reset_lockdep_class(root, b);
2290 :
2291 467875229 : if (level <= write_lock_level) {
2292 258294152 : btrfs_tree_lock(b);
2293 258336402 : p->locks[level] = BTRFS_WRITE_LOCK;
2294 : } else {
2295 209581077 : if (p->nowait) {
2296 0 : if (!btrfs_try_tree_read_lock(b)) {
2297 0 : free_extent_buffer(b);
2298 0 : ret = -EAGAIN;
2299 0 : goto done;
2300 : }
2301 : } else {
2302 209581077 : btrfs_tree_read_lock(b);
2303 : }
2304 209576973 : p->locks[level] = BTRFS_READ_LOCK;
2305 : }
2306 467913375 : p->nodes[level] = b;
2307 : }
2308 : }
2309 : ret = 1;
2310 362387810 : done:
2311 362387810 : if (ret < 0 && !p->skip_release_on_error)
2312 93 : btrfs_release_path(p);
2313 :
2314 362387810 : if (p->need_commit_sem) {
2315 18750183 : int ret2;
2316 :
2317 18750183 : ret2 = finish_need_commit_sem_search(p);
2318 18751134 : up_read(&fs_info->commit_root_sem);
2319 18751864 : if (ret2)
2320 0 : ret = ret2;
2321 : }
2322 :
2323 : return ret;
2324 : }
2325 : ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2326 :
2327 : /*
2328 : * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2329 : * current state of the tree together with the operations recorded in the tree
2330 : * modification log to search for the key in a previous version of this tree, as
2331 : * denoted by the time_seq parameter.
2332 : *
2333 : * Naturally, there is no support for insert, delete or cow operations.
2334 : *
2335 : * The resulting path and return value will be set up as if we called
2336 : * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2337 : */
2338 3377868 : int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2339 : struct btrfs_path *p, u64 time_seq)
2340 : {
2341 3377868 : struct btrfs_fs_info *fs_info = root->fs_info;
2342 3377868 : struct extent_buffer *b;
2343 3377868 : int slot;
2344 3377868 : int ret;
2345 3377868 : int err;
2346 3377868 : int level;
2347 3377868 : int lowest_unlock = 1;
2348 3377868 : u8 lowest_level = 0;
2349 :
2350 3377868 : lowest_level = p->lowest_level;
2351 3377868 : WARN_ON(p->nodes[0] != NULL);
2352 3377868 : ASSERT(!p->nowait);
2353 :
2354 3377868 : if (p->search_commit_root) {
2355 3188697 : BUG_ON(time_seq);
2356 3188697 : return btrfs_search_slot(NULL, root, key, p, 0, 0);
2357 : }
2358 :
2359 189171 : again:
2360 189171 : b = btrfs_get_old_root(root, time_seq);
2361 189171 : if (!b) {
2362 0 : ret = -EIO;
2363 0 : goto done;
2364 : }
2365 189171 : level = btrfs_header_level(b);
2366 189171 : p->locks[level] = BTRFS_READ_LOCK;
2367 :
2368 389882 : while (b) {
2369 389882 : int dec = 0;
2370 :
2371 389882 : level = btrfs_header_level(b);
2372 389882 : p->nodes[level] = b;
2373 :
2374 : /*
2375 : * we have a lock on b and as long as we aren't changing
2376 : * the tree, there is no way to for the items in b to change.
2377 : * It is safe to drop the lock on our parent before we
2378 : * go through the expensive btree search on b.
2379 : */
2380 389882 : btrfs_unlock_up_safe(p, level + 1);
2381 :
2382 389882 : ret = btrfs_bin_search(b, 0, key, &slot);
2383 389882 : if (ret < 0)
2384 0 : goto done;
2385 :
2386 389882 : if (level == 0) {
2387 71378 : p->slots[level] = slot;
2388 71378 : unlock_up(p, level, lowest_unlock, 0, NULL);
2389 71378 : goto done;
2390 : }
2391 :
2392 318504 : if (ret && slot > 0) {
2393 198149 : dec = 1;
2394 198149 : slot--;
2395 : }
2396 318504 : p->slots[level] = slot;
2397 318504 : unlock_up(p, level, lowest_unlock, 0, NULL);
2398 :
2399 318504 : if (level == lowest_level) {
2400 117793 : if (dec)
2401 0 : p->slots[level]++;
2402 117793 : goto done;
2403 : }
2404 :
2405 200711 : err = read_block_for_search(root, p, &b, level, slot, key);
2406 200711 : if (err == -EAGAIN)
2407 0 : goto again;
2408 200711 : if (err) {
2409 0 : ret = err;
2410 0 : goto done;
2411 : }
2412 :
2413 200711 : level = btrfs_header_level(b);
2414 200711 : btrfs_tree_read_lock(b);
2415 200711 : b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2416 200711 : if (!b) {
2417 0 : ret = -ENOMEM;
2418 0 : goto done;
2419 : }
2420 200711 : p->locks[level] = BTRFS_READ_LOCK;
2421 200711 : p->nodes[level] = b;
2422 : }
2423 : ret = 1;
2424 189171 : done:
2425 189171 : if (ret < 0)
2426 0 : btrfs_release_path(p);
2427 :
2428 : return ret;
2429 : }
2430 :
2431 : /*
2432 : * Search the tree again to find a leaf with smaller keys.
2433 : * Returns 0 if it found something.
2434 : * Returns 1 if there are no smaller keys.
2435 : * Returns < 0 on error.
2436 : *
2437 : * This may release the path, and so you may lose any locks held at the
2438 : * time you call it.
2439 : */
2440 960 : static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2441 : {
2442 960 : struct btrfs_key key;
2443 960 : struct btrfs_key orig_key;
2444 960 : struct btrfs_disk_key found_key;
2445 960 : int ret;
2446 :
2447 960 : btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2448 960 : orig_key = key;
2449 :
2450 960 : if (key.offset > 0) {
2451 960 : key.offset--;
2452 0 : } else if (key.type > 0) {
2453 0 : key.type--;
2454 0 : key.offset = (u64)-1;
2455 0 : } else if (key.objectid > 0) {
2456 0 : key.objectid--;
2457 0 : key.type = (u8)-1;
2458 0 : key.offset = (u64)-1;
2459 : } else {
2460 : return 1;
2461 : }
2462 :
2463 960 : btrfs_release_path(path);
2464 960 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2465 960 : if (ret <= 0)
2466 : return ret;
2467 :
2468 : /*
2469 : * Previous key not found. Even if we were at slot 0 of the leaf we had
2470 : * before releasing the path and calling btrfs_search_slot(), we now may
2471 : * be in a slot pointing to the same original key - this can happen if
2472 : * after we released the path, one of more items were moved from a
2473 : * sibling leaf into the front of the leaf we had due to an insertion
2474 : * (see push_leaf_right()).
2475 : * If we hit this case and our slot is > 0 and just decrement the slot
2476 : * so that the caller does not process the same key again, which may or
2477 : * may not break the caller, depending on its logic.
2478 : */
2479 960 : if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2480 958 : btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2481 958 : ret = comp_keys(&found_key, &orig_key);
2482 958 : if (ret == 0) {
2483 958 : if (path->slots[0] > 0) {
2484 0 : path->slots[0]--;
2485 0 : return 0;
2486 : }
2487 : /*
2488 : * At slot 0, same key as before, it means orig_key is
2489 : * the lowest, leftmost, key in the tree. We're done.
2490 : */
2491 : return 1;
2492 : }
2493 : }
2494 :
2495 2 : btrfs_item_key(path->nodes[0], &found_key, 0);
2496 2 : ret = comp_keys(&found_key, &key);
2497 : /*
2498 : * We might have had an item with the previous key in the tree right
2499 : * before we released our path. And after we released our path, that
2500 : * item might have been pushed to the first slot (0) of the leaf we
2501 : * were holding due to a tree balance. Alternatively, an item with the
2502 : * previous key can exist as the only element of a leaf (big fat item).
2503 : * Therefore account for these 2 cases, so that our callers (like
2504 : * btrfs_previous_item) don't miss an existing item with a key matching
2505 : * the previous key we computed above.
2506 : */
2507 2 : if (ret <= 0)
2508 2 : return 0;
2509 : return 1;
2510 : }
2511 :
2512 : /*
2513 : * helper to use instead of search slot if no exact match is needed but
2514 : * instead the next or previous item should be returned.
2515 : * When find_higher is true, the next higher item is returned, the next lower
2516 : * otherwise.
2517 : * When return_any and find_higher are both true, and no higher item is found,
2518 : * return the next lower instead.
2519 : * When return_any is true and find_higher is false, and no lower item is found,
2520 : * return the next higher instead.
2521 : * It returns 0 if any item is found, 1 if none is found (tree empty), and
2522 : * < 0 on error
2523 : */
2524 1842230 : int btrfs_search_slot_for_read(struct btrfs_root *root,
2525 : const struct btrfs_key *key,
2526 : struct btrfs_path *p, int find_higher,
2527 : int return_any)
2528 : {
2529 1842230 : int ret;
2530 1842230 : struct extent_buffer *leaf;
2531 :
2532 : again:
2533 1842230 : ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2534 1842252 : if (ret <= 0)
2535 1103 : return ret;
2536 : /*
2537 : * a return value of 1 means the path is at the position where the
2538 : * item should be inserted. Normally this is the next bigger item,
2539 : * but in case the previous item is the last in a leaf, path points
2540 : * to the first free slot in the previous leaf, i.e. at an invalid
2541 : * item.
2542 : */
2543 1841149 : leaf = p->nodes[0];
2544 :
2545 1841149 : if (find_higher) {
2546 1432371 : if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2547 62198 : ret = btrfs_next_leaf(root, p);
2548 62198 : if (ret <= 0)
2549 62117 : return ret;
2550 81 : if (!return_any)
2551 : return 1;
2552 : /*
2553 : * no higher item found, return the next
2554 : * lower instead
2555 : */
2556 0 : return_any = 0;
2557 0 : find_higher = 0;
2558 0 : btrfs_release_path(p);
2559 0 : goto again;
2560 : }
2561 : } else {
2562 408778 : if (p->slots[0] == 0) {
2563 0 : ret = btrfs_prev_leaf(root, p);
2564 0 : if (ret < 0)
2565 0 : return ret;
2566 0 : if (!ret) {
2567 0 : leaf = p->nodes[0];
2568 0 : if (p->slots[0] == btrfs_header_nritems(leaf))
2569 0 : p->slots[0]--;
2570 0 : return 0;
2571 : }
2572 0 : if (!return_any)
2573 : return 1;
2574 : /*
2575 : * no lower item found, return the next
2576 : * higher instead
2577 : */
2578 0 : return_any = 0;
2579 0 : find_higher = 1;
2580 0 : btrfs_release_path(p);
2581 0 : goto again;
2582 : } else {
2583 408778 : --p->slots[0];
2584 : }
2585 : }
2586 : return 0;
2587 : }
2588 :
2589 : /*
2590 : * Execute search and call btrfs_previous_item to traverse backwards if the item
2591 : * was not found.
2592 : *
2593 : * Return 0 if found, 1 if not found and < 0 if error.
2594 : */
2595 3777 : int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2596 : struct btrfs_path *path)
2597 : {
2598 3777 : int ret;
2599 :
2600 3777 : ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2601 3777 : if (ret > 0)
2602 3550 : ret = btrfs_previous_item(root, path, key->objectid, key->type);
2603 :
2604 3777 : if (ret == 0)
2605 2483 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2606 :
2607 3777 : return ret;
2608 : }
2609 :
2610 : /*
2611 : * Search for a valid slot for the given path.
2612 : *
2613 : * @root: The root node of the tree.
2614 : * @key: Will contain a valid item if found.
2615 : * @path: The starting point to validate the slot.
2616 : *
2617 : * Return: 0 if the item is valid
2618 : * 1 if not found
2619 : * <0 if error.
2620 : */
2621 2404074373 : int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2622 : struct btrfs_path *path)
2623 : {
2624 2404074373 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2625 9972950 : int ret;
2626 :
2627 9972950 : ret = btrfs_next_leaf(root, path);
2628 9972934 : if (ret)
2629 : return ret;
2630 : }
2631 :
2632 2404058949 : btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2633 2404058949 : return 0;
2634 : }
2635 :
2636 : /*
2637 : * adjust the pointers going up the tree, starting at level
2638 : * making sure the right key of each node is points to 'key'.
2639 : * This is used after shifting pointers to the left, so it stops
2640 : * fixing up pointers when a given leaf/node is not in slot 0 of the
2641 : * higher levels
2642 : *
2643 : */
2644 4557255 : static void fixup_low_keys(struct btrfs_path *path,
2645 : struct btrfs_disk_key *key, int level)
2646 : {
2647 4557255 : int i;
2648 4557255 : struct extent_buffer *t;
2649 4557255 : int ret;
2650 :
2651 4901290 : for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2652 4901290 : int tslot = path->slots[i];
2653 :
2654 4901290 : if (!path->nodes[i])
2655 : break;
2656 4444037 : t = path->nodes[i];
2657 4444037 : ret = btrfs_tree_mod_log_insert_key(t, tslot,
2658 : BTRFS_MOD_LOG_KEY_REPLACE);
2659 4444046 : BUG_ON(ret < 0);
2660 4444046 : btrfs_set_node_key(t, key, tslot);
2661 4444044 : btrfs_mark_buffer_dirty(path->nodes[i]);
2662 4444106 : if (tslot != 0)
2663 : break;
2664 : }
2665 4557324 : }
2666 :
2667 : /*
2668 : * update item key.
2669 : *
2670 : * This function isn't completely safe. It's the caller's responsibility
2671 : * that the new key won't break the order
2672 : */
2673 5316572 : void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2674 : struct btrfs_path *path,
2675 : const struct btrfs_key *new_key)
2676 : {
2677 5316572 : struct btrfs_disk_key disk_key;
2678 5316572 : struct extent_buffer *eb;
2679 5316572 : int slot;
2680 :
2681 5316572 : eb = path->nodes[0];
2682 5316572 : slot = path->slots[0];
2683 5316572 : if (slot > 0) {
2684 4760978 : btrfs_item_key(eb, &disk_key, slot - 1);
2685 4760979 : if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2686 0 : btrfs_print_leaf(eb);
2687 0 : btrfs_crit(fs_info,
2688 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2689 : slot, btrfs_disk_key_objectid(&disk_key),
2690 : btrfs_disk_key_type(&disk_key),
2691 : btrfs_disk_key_offset(&disk_key),
2692 : new_key->objectid, new_key->type,
2693 : new_key->offset);
2694 0 : BUG();
2695 : }
2696 : }
2697 5316573 : if (slot < btrfs_header_nritems(eb) - 1) {
2698 4911992 : btrfs_item_key(eb, &disk_key, slot + 1);
2699 4911988 : if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2700 0 : btrfs_print_leaf(eb);
2701 0 : btrfs_crit(fs_info,
2702 : "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2703 : slot, btrfs_disk_key_objectid(&disk_key),
2704 : btrfs_disk_key_type(&disk_key),
2705 : btrfs_disk_key_offset(&disk_key),
2706 : new_key->objectid, new_key->type,
2707 : new_key->offset);
2708 0 : BUG();
2709 : }
2710 : }
2711 :
2712 5316569 : btrfs_cpu_key_to_disk(&disk_key, new_key);
2713 5316571 : btrfs_set_item_key(eb, &disk_key, slot);
2714 5316571 : btrfs_mark_buffer_dirty(eb);
2715 5316575 : if (slot == 0)
2716 555601 : fixup_low_keys(path, &disk_key, 1);
2717 5316575 : }
2718 :
2719 : /*
2720 : * Check key order of two sibling extent buffers.
2721 : *
2722 : * Return true if something is wrong.
2723 : * Return false if everything is fine.
2724 : *
2725 : * Tree-checker only works inside one tree block, thus the following
2726 : * corruption can not be detected by tree-checker:
2727 : *
2728 : * Leaf @left | Leaf @right
2729 : * --------------------------------------------------------------
2730 : * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2731 : *
2732 : * Key f6 in leaf @left itself is valid, but not valid when the next
2733 : * key in leaf @right is 7.
2734 : * This can only be checked at tree block merge time.
2735 : * And since tree checker has ensured all key order in each tree block
2736 : * is correct, we only need to bother the last key of @left and the first
2737 : * key of @right.
2738 : */
2739 4140399 : static bool check_sibling_keys(struct extent_buffer *left,
2740 : struct extent_buffer *right)
2741 : {
2742 4140399 : struct btrfs_key left_last;
2743 4140399 : struct btrfs_key right_first;
2744 4140399 : int level = btrfs_header_level(left);
2745 4140399 : int nr_left = btrfs_header_nritems(left);
2746 4140399 : int nr_right = btrfs_header_nritems(right);
2747 :
2748 : /* No key to check in one of the tree blocks */
2749 4140399 : if (!nr_left || !nr_right)
2750 : return false;
2751 :
2752 4140336 : if (level) {
2753 116869 : btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2754 116869 : btrfs_node_key_to_cpu(right, &right_first, 0);
2755 : } else {
2756 4023467 : btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2757 4023469 : btrfs_item_key_to_cpu(right, &right_first, 0);
2758 : }
2759 :
2760 4140335 : if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2761 0 : btrfs_crit(left->fs_info, "left extent buffer:");
2762 0 : btrfs_print_tree(left, false);
2763 0 : btrfs_crit(left->fs_info, "right extent buffer:");
2764 0 : btrfs_print_tree(right, false);
2765 0 : btrfs_crit(left->fs_info,
2766 : "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2767 : left_last.objectid, left_last.type,
2768 : left_last.offset, right_first.objectid,
2769 : right_first.type, right_first.offset);
2770 0 : return true;
2771 : }
2772 : return false;
2773 : }
2774 :
2775 : /*
2776 : * try to push data from one node into the next node left in the
2777 : * tree.
2778 : *
2779 : * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2780 : * error, and > 0 if there was no room in the left hand block.
2781 : */
2782 63325 : static int push_node_left(struct btrfs_trans_handle *trans,
2783 : struct extent_buffer *dst,
2784 : struct extent_buffer *src, int empty)
2785 : {
2786 63325 : struct btrfs_fs_info *fs_info = trans->fs_info;
2787 63325 : int push_items = 0;
2788 63325 : int src_nritems;
2789 63325 : int dst_nritems;
2790 63325 : int ret = 0;
2791 :
2792 63325 : src_nritems = btrfs_header_nritems(src);
2793 63325 : dst_nritems = btrfs_header_nritems(dst);
2794 63325 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2795 63325 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2796 63325 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2797 :
2798 63325 : if (!empty && src_nritems <= 8)
2799 : return 1;
2800 :
2801 63325 : if (push_items <= 0)
2802 : return 1;
2803 :
2804 59248 : if (empty) {
2805 365 : push_items = min(src_nritems, push_items);
2806 365 : if (push_items < src_nritems) {
2807 : /* leave at least 8 pointers in the node if
2808 : * we aren't going to empty it
2809 : */
2810 190 : if (src_nritems - push_items < 8) {
2811 94 : if (push_items <= 8)
2812 : return 1;
2813 2 : push_items -= 8;
2814 : }
2815 : }
2816 : } else
2817 58883 : push_items = min(src_nritems - 8, push_items);
2818 :
2819 : /* dst is the left eb, src is the middle eb */
2820 59156 : if (check_sibling_keys(dst, src)) {
2821 0 : ret = -EUCLEAN;
2822 0 : btrfs_abort_transaction(trans, ret);
2823 0 : return ret;
2824 : }
2825 59156 : ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2826 59156 : if (ret) {
2827 0 : btrfs_abort_transaction(trans, ret);
2828 0 : return ret;
2829 : }
2830 59156 : copy_extent_buffer(dst, src,
2831 : btrfs_node_key_ptr_offset(dst, dst_nritems),
2832 : btrfs_node_key_ptr_offset(src, 0),
2833 : push_items * sizeof(struct btrfs_key_ptr));
2834 :
2835 59156 : if (push_items < src_nritems) {
2836 : /*
2837 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2838 : * don't need to do an explicit tree mod log operation for it.
2839 : */
2840 58981 : memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2841 : btrfs_node_key_ptr_offset(src, push_items),
2842 58981 : (src_nritems - push_items) *
2843 : sizeof(struct btrfs_key_ptr));
2844 : }
2845 59156 : btrfs_set_header_nritems(src, src_nritems - push_items);
2846 59156 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2847 59156 : btrfs_mark_buffer_dirty(src);
2848 59156 : btrfs_mark_buffer_dirty(dst);
2849 :
2850 59156 : return ret;
2851 : }
2852 :
2853 : /*
2854 : * try to push data from one node into the next node right in the
2855 : * tree.
2856 : *
2857 : * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2858 : * error, and > 0 if there was no room in the right hand block.
2859 : *
2860 : * this will only push up to 1/2 the contents of the left node over
2861 : */
2862 57776 : static int balance_node_right(struct btrfs_trans_handle *trans,
2863 : struct extent_buffer *dst,
2864 : struct extent_buffer *src)
2865 : {
2866 57776 : struct btrfs_fs_info *fs_info = trans->fs_info;
2867 57776 : int push_items = 0;
2868 57776 : int max_push;
2869 57776 : int src_nritems;
2870 57776 : int dst_nritems;
2871 57776 : int ret = 0;
2872 :
2873 57776 : WARN_ON(btrfs_header_generation(src) != trans->transid);
2874 57776 : WARN_ON(btrfs_header_generation(dst) != trans->transid);
2875 :
2876 57776 : src_nritems = btrfs_header_nritems(src);
2877 57776 : dst_nritems = btrfs_header_nritems(dst);
2878 57776 : push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2879 57776 : if (push_items <= 0)
2880 : return 1;
2881 :
2882 57776 : if (src_nritems < 4)
2883 : return 1;
2884 :
2885 57776 : max_push = src_nritems / 2 + 1;
2886 : /* don't try to empty the node */
2887 57776 : if (max_push >= src_nritems)
2888 : return 1;
2889 :
2890 57776 : if (max_push < push_items)
2891 : push_items = max_push;
2892 :
2893 : /* dst is the right eb, src is the middle eb */
2894 57776 : if (check_sibling_keys(src, dst)) {
2895 0 : ret = -EUCLEAN;
2896 0 : btrfs_abort_transaction(trans, ret);
2897 0 : return ret;
2898 : }
2899 :
2900 : /*
2901 : * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2902 : * need to do an explicit tree mod log operation for it.
2903 : */
2904 57776 : memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2905 : btrfs_node_key_ptr_offset(dst, 0),
2906 : (dst_nritems) *
2907 : sizeof(struct btrfs_key_ptr));
2908 :
2909 57776 : ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2910 : push_items);
2911 57776 : if (ret) {
2912 0 : btrfs_abort_transaction(trans, ret);
2913 0 : return ret;
2914 : }
2915 57776 : copy_extent_buffer(dst, src,
2916 : btrfs_node_key_ptr_offset(dst, 0),
2917 : btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2918 : push_items * sizeof(struct btrfs_key_ptr));
2919 :
2920 57776 : btrfs_set_header_nritems(src, src_nritems - push_items);
2921 57776 : btrfs_set_header_nritems(dst, dst_nritems + push_items);
2922 :
2923 57776 : btrfs_mark_buffer_dirty(src);
2924 57776 : btrfs_mark_buffer_dirty(dst);
2925 :
2926 57776 : return ret;
2927 : }
2928 :
2929 : /*
2930 : * helper function to insert a new root level in the tree.
2931 : * A new node is allocated, and a single item is inserted to
2932 : * point to the existing root
2933 : *
2934 : * returns zero on success or < 0 on failure.
2935 : */
2936 2019 : static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2937 : struct btrfs_root *root,
2938 : struct btrfs_path *path, int level)
2939 : {
2940 2019 : struct btrfs_fs_info *fs_info = root->fs_info;
2941 2019 : u64 lower_gen;
2942 2019 : struct extent_buffer *lower;
2943 2019 : struct extent_buffer *c;
2944 2019 : struct extent_buffer *old;
2945 2019 : struct btrfs_disk_key lower_key;
2946 2019 : int ret;
2947 :
2948 2019 : BUG_ON(path->nodes[level]);
2949 2019 : BUG_ON(path->nodes[level-1] != root->node);
2950 :
2951 2019 : lower = path->nodes[level-1];
2952 2019 : if (level == 1)
2953 1944 : btrfs_item_key(lower, &lower_key, 0);
2954 : else
2955 75 : btrfs_node_key(lower, &lower_key, 0);
2956 :
2957 2019 : c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2958 2019 : &lower_key, level, root->node->start, 0,
2959 : BTRFS_NESTING_NEW_ROOT);
2960 2019 : if (IS_ERR(c))
2961 0 : return PTR_ERR(c);
2962 :
2963 2019 : root_add_used(root, fs_info->nodesize);
2964 :
2965 2019 : btrfs_set_header_nritems(c, 1);
2966 2019 : btrfs_set_node_key(c, &lower_key, 0);
2967 2019 : btrfs_set_node_blockptr(c, 0, lower->start);
2968 2019 : lower_gen = btrfs_header_generation(lower);
2969 2019 : WARN_ON(lower_gen != trans->transid);
2970 :
2971 2019 : btrfs_set_node_ptr_generation(c, 0, lower_gen);
2972 :
2973 2019 : btrfs_mark_buffer_dirty(c);
2974 :
2975 2019 : old = root->node;
2976 2019 : ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2977 2019 : if (ret < 0) {
2978 0 : btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2979 0 : btrfs_tree_unlock(c);
2980 0 : free_extent_buffer(c);
2981 0 : return ret;
2982 : }
2983 2019 : rcu_assign_pointer(root->node, c);
2984 :
2985 : /* the super has an extra ref to root->node */
2986 2019 : free_extent_buffer(old);
2987 :
2988 2019 : add_root_to_dirty_list(root);
2989 2019 : atomic_inc(&c->refs);
2990 2019 : path->nodes[level] = c;
2991 2019 : path->locks[level] = BTRFS_WRITE_LOCK;
2992 2019 : path->slots[level] = 0;
2993 2019 : return 0;
2994 : }
2995 :
2996 : /*
2997 : * worker function to insert a single pointer in a node.
2998 : * the node should have enough room for the pointer already
2999 : *
3000 : * slot and level indicate where you want the key to go, and
3001 : * blocknr is the block the key points to.
3002 : */
3003 373137 : static int insert_ptr(struct btrfs_trans_handle *trans,
3004 : struct btrfs_path *path,
3005 : struct btrfs_disk_key *key, u64 bytenr,
3006 : int slot, int level)
3007 : {
3008 373137 : struct extent_buffer *lower;
3009 373137 : int nritems;
3010 373137 : int ret;
3011 :
3012 373137 : BUG_ON(!path->nodes[level]);
3013 373137 : btrfs_assert_tree_write_locked(path->nodes[level]);
3014 373137 : lower = path->nodes[level];
3015 373137 : nritems = btrfs_header_nritems(lower);
3016 373137 : BUG_ON(slot > nritems);
3017 373137 : BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3018 373137 : if (slot != nritems) {
3019 192447 : if (level) {
3020 192447 : ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
3021 : slot, nritems - slot);
3022 192436 : if (ret < 0) {
3023 0 : btrfs_abort_transaction(trans, ret);
3024 0 : return ret;
3025 : }
3026 : }
3027 192436 : memmove_extent_buffer(lower,
3028 : btrfs_node_key_ptr_offset(lower, slot + 1),
3029 : btrfs_node_key_ptr_offset(lower, slot),
3030 192436 : (nritems - slot) * sizeof(struct btrfs_key_ptr));
3031 : }
3032 373136 : if (level) {
3033 373136 : ret = btrfs_tree_mod_log_insert_key(lower, slot,
3034 : BTRFS_MOD_LOG_KEY_ADD);
3035 373128 : if (ret < 0) {
3036 0 : btrfs_abort_transaction(trans, ret);
3037 0 : return ret;
3038 : }
3039 : }
3040 373128 : btrfs_set_node_key(lower, key, slot);
3041 373126 : btrfs_set_node_blockptr(lower, slot, bytenr);
3042 373122 : WARN_ON(trans->transid == 0);
3043 373122 : btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3044 373123 : btrfs_set_header_nritems(lower, nritems + 1);
3045 373123 : btrfs_mark_buffer_dirty(lower);
3046 :
3047 373123 : return 0;
3048 : }
3049 :
3050 : /*
3051 : * split the node at the specified level in path in two.
3052 : * The path is corrected to point to the appropriate node after the split
3053 : *
3054 : * Before splitting this tries to make some room in the node by pushing
3055 : * left and right, if either one works, it returns right away.
3056 : *
3057 : * returns 0 on success and < 0 on failure
3058 : */
3059 117071 : static noinline int split_node(struct btrfs_trans_handle *trans,
3060 : struct btrfs_root *root,
3061 : struct btrfs_path *path, int level)
3062 : {
3063 117071 : struct btrfs_fs_info *fs_info = root->fs_info;
3064 117071 : struct extent_buffer *c;
3065 117071 : struct extent_buffer *split;
3066 117071 : struct btrfs_disk_key disk_key;
3067 117071 : int mid;
3068 117071 : int ret;
3069 117071 : u32 c_nritems;
3070 :
3071 117071 : c = path->nodes[level];
3072 117071 : WARN_ON(btrfs_header_generation(c) != trans->transid);
3073 117071 : if (c == root->node) {
3074 : /*
3075 : * trying to split the root, lets make a new one
3076 : *
3077 : * tree mod log: We don't log_removal old root in
3078 : * insert_new_root, because that root buffer will be kept as a
3079 : * normal node. We are going to log removal of half of the
3080 : * elements below with btrfs_tree_mod_log_eb_copy(). We're
3081 : * holding a tree lock on the buffer, which is why we cannot
3082 : * race with other tree_mod_log users.
3083 : */
3084 75 : ret = insert_new_root(trans, root, path, level + 1);
3085 75 : if (ret)
3086 : return ret;
3087 : } else {
3088 116996 : ret = push_nodes_for_insert(trans, root, path, level);
3089 116996 : c = path->nodes[level];
3090 116996 : if (!ret && btrfs_header_nritems(c) <
3091 116659 : BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3092 : return 0;
3093 710 : if (ret < 0)
3094 : return ret;
3095 : }
3096 :
3097 785 : c_nritems = btrfs_header_nritems(c);
3098 785 : mid = (c_nritems + 1) / 2;
3099 785 : btrfs_node_key(c, &disk_key, mid);
3100 :
3101 785 : split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3102 : &disk_key, level, c->start, 0,
3103 : BTRFS_NESTING_SPLIT);
3104 785 : if (IS_ERR(split))
3105 0 : return PTR_ERR(split);
3106 :
3107 785 : root_add_used(root, fs_info->nodesize);
3108 785 : ASSERT(btrfs_header_level(c) == level);
3109 :
3110 785 : ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3111 785 : if (ret) {
3112 0 : btrfs_tree_unlock(split);
3113 0 : free_extent_buffer(split);
3114 0 : btrfs_abort_transaction(trans, ret);
3115 0 : return ret;
3116 : }
3117 785 : copy_extent_buffer(split, c,
3118 : btrfs_node_key_ptr_offset(split, 0),
3119 : btrfs_node_key_ptr_offset(c, mid),
3120 785 : (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3121 785 : btrfs_set_header_nritems(split, c_nritems - mid);
3122 785 : btrfs_set_header_nritems(c, mid);
3123 :
3124 785 : btrfs_mark_buffer_dirty(c);
3125 785 : btrfs_mark_buffer_dirty(split);
3126 :
3127 785 : ret = insert_ptr(trans, path, &disk_key, split->start,
3128 785 : path->slots[level + 1] + 1, level + 1);
3129 785 : if (ret < 0) {
3130 0 : btrfs_tree_unlock(split);
3131 0 : free_extent_buffer(split);
3132 0 : return ret;
3133 : }
3134 :
3135 785 : if (path->slots[level] >= mid) {
3136 660 : path->slots[level] -= mid;
3137 660 : btrfs_tree_unlock(c);
3138 660 : free_extent_buffer(c);
3139 660 : path->nodes[level] = split;
3140 660 : path->slots[level + 1] += 1;
3141 : } else {
3142 125 : btrfs_tree_unlock(split);
3143 125 : free_extent_buffer(split);
3144 : }
3145 : return 0;
3146 : }
3147 :
3148 : /*
3149 : * how many bytes are required to store the items in a leaf. start
3150 : * and nr indicate which items in the leaf to check. This totals up the
3151 : * space used both by the item structs and the item data
3152 : */
3153 349079376 : static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3154 : {
3155 349079376 : int data_len;
3156 349079376 : int nritems = btrfs_header_nritems(l);
3157 349079376 : int end = min(nritems, start + nr) - 1;
3158 :
3159 349079376 : if (!nr)
3160 : return 0;
3161 349046665 : data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3162 348914213 : data_len = data_len - btrfs_item_offset(l, end);
3163 348947943 : data_len += sizeof(struct btrfs_item) * nr;
3164 348947943 : WARN_ON(data_len < 0);
3165 : return data_len;
3166 : }
3167 :
3168 : /*
3169 : * The space between the end of the leaf items and
3170 : * the start of the leaf data. IOW, how much room
3171 : * the leaf has left for both items and data
3172 : */
3173 317223955 : int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3174 : {
3175 317223955 : struct btrfs_fs_info *fs_info = leaf->fs_info;
3176 317223955 : int nritems = btrfs_header_nritems(leaf);
3177 317223955 : int ret;
3178 :
3179 317223955 : ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3180 317098114 : if (ret < 0) {
3181 0 : btrfs_crit(fs_info,
3182 : "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3183 : ret,
3184 : (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3185 : leaf_space_used(leaf, 0, nritems), nritems);
3186 : }
3187 317098114 : return ret;
3188 : }
3189 :
3190 : /*
3191 : * min slot controls the lowest index we're willing to push to the
3192 : * right. We'll push up to and including min_slot, but no lower
3193 : */
3194 2786121 : static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3195 : struct btrfs_path *path,
3196 : int data_size, int empty,
3197 : struct extent_buffer *right,
3198 : int free_space, u32 left_nritems,
3199 : u32 min_slot)
3200 : {
3201 2786121 : struct btrfs_fs_info *fs_info = right->fs_info;
3202 2786121 : struct extent_buffer *left = path->nodes[0];
3203 2786121 : struct extent_buffer *upper = path->nodes[1];
3204 2786121 : struct btrfs_map_token token;
3205 2786121 : struct btrfs_disk_key disk_key;
3206 2786121 : int slot;
3207 2786121 : u32 i;
3208 2786121 : int push_space = 0;
3209 2786121 : int push_items = 0;
3210 2786121 : u32 nr;
3211 2786121 : u32 right_nritems;
3212 2786121 : u32 data_end;
3213 2786121 : u32 this_item_size;
3214 :
3215 2786121 : if (empty)
3216 : nr = 0;
3217 : else
3218 2762719 : nr = max_t(u32, 1, min_slot);
3219 :
3220 2786121 : if (path->slots[0] >= left_nritems)
3221 3338 : push_space += data_size;
3222 :
3223 2786121 : slot = path->slots[1];
3224 2786121 : i = left_nritems - 1;
3225 95145280 : while (i >= nr) {
3226 95143422 : if (!empty && push_items > 0) {
3227 91441479 : if (path->slots[0] > i)
3228 : break;
3229 91248938 : if (path->slots[0] == i) {
3230 341750 : int space = btrfs_leaf_free_space(left);
3231 :
3232 341750 : if (space + push_space * 2 > free_space)
3233 : break;
3234 : }
3235 : }
3236 :
3237 94785822 : if (path->slots[0] == i)
3238 243949 : push_space += data_size;
3239 :
3240 94785822 : this_item_size = btrfs_item_size(left, i);
3241 94785863 : if (this_item_size + sizeof(struct btrfs_item) +
3242 94785863 : push_space > free_space)
3243 : break;
3244 :
3245 92382521 : push_items++;
3246 92382521 : push_space += this_item_size + sizeof(struct btrfs_item);
3247 92382521 : if (i == 0)
3248 : break;
3249 92359159 : i--;
3250 : }
3251 :
3252 2786162 : if (push_items == 0)
3253 346224 : goto out_unlock;
3254 :
3255 2439938 : WARN_ON(!empty && push_items == left_nritems);
3256 :
3257 : /* push left to right */
3258 2439938 : right_nritems = btrfs_header_nritems(right);
3259 :
3260 2439938 : push_space = btrfs_item_data_end(left, left_nritems - push_items);
3261 2439925 : push_space -= leaf_data_end(left);
3262 :
3263 : /* make room in the right data area */
3264 2439915 : data_end = leaf_data_end(right);
3265 2439919 : memmove_leaf_data(right, data_end - push_space, data_end,
3266 2439919 : BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3267 :
3268 : /* copy from the left data area */
3269 2439937 : copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3270 2439934 : leaf_data_end(left), push_space);
3271 :
3272 2439934 : memmove_leaf_items(right, push_items, 0, right_nritems);
3273 :
3274 : /* copy the items from left to right */
3275 2439935 : copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3276 :
3277 : /* update the item pointers */
3278 2439938 : btrfs_init_map_token(&token, right);
3279 2439933 : right_nritems += push_items;
3280 2439933 : btrfs_set_header_nritems(right, right_nritems);
3281 2439933 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3282 626991302 : for (i = 0; i < right_nritems; i++) {
3283 624551362 : push_space -= btrfs_token_item_size(&token, i);
3284 624552349 : btrfs_set_token_item_offset(&token, i, push_space);
3285 : }
3286 :
3287 2439940 : left_nritems -= push_items;
3288 2439940 : btrfs_set_header_nritems(left, left_nritems);
3289 :
3290 2439940 : if (left_nritems)
3291 2416578 : btrfs_mark_buffer_dirty(left);
3292 : else
3293 23362 : btrfs_clear_buffer_dirty(trans, left);
3294 :
3295 2439941 : btrfs_mark_buffer_dirty(right);
3296 :
3297 2439942 : btrfs_item_key(right, &disk_key, 0);
3298 2439938 : btrfs_set_node_key(upper, &disk_key, slot + 1);
3299 2439924 : btrfs_mark_buffer_dirty(upper);
3300 :
3301 : /* then fixup the leaf pointer in the path */
3302 2439940 : if (path->slots[0] >= left_nritems) {
3303 217688 : path->slots[0] -= left_nritems;
3304 217688 : if (btrfs_header_nritems(path->nodes[0]) == 0)
3305 23362 : btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3306 217688 : btrfs_tree_unlock(path->nodes[0]);
3307 217688 : free_extent_buffer(path->nodes[0]);
3308 217688 : path->nodes[0] = right;
3309 217688 : path->slots[1] += 1;
3310 : } else {
3311 2222252 : btrfs_tree_unlock(right);
3312 2222250 : free_extent_buffer(right);
3313 : }
3314 : return 0;
3315 :
3316 : out_unlock:
3317 346224 : btrfs_tree_unlock(right);
3318 346226 : free_extent_buffer(right);
3319 346226 : return 1;
3320 : }
3321 :
3322 : /*
3323 : * push some data in the path leaf to the right, trying to free up at
3324 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3325 : *
3326 : * returns 1 if the push failed because the other node didn't have enough
3327 : * room, 0 if everything worked out and < 0 if there were major errors.
3328 : *
3329 : * this will push starting from min_slot to the end of the leaf. It won't
3330 : * push any slot lower than min_slot
3331 : */
3332 7737746 : static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3333 : *root, struct btrfs_path *path,
3334 : int min_data_size, int data_size,
3335 : int empty, u32 min_slot)
3336 : {
3337 7737746 : struct extent_buffer *left = path->nodes[0];
3338 7737746 : struct extent_buffer *right;
3339 7737746 : struct extent_buffer *upper;
3340 7737746 : int slot;
3341 7737746 : int free_space;
3342 7737746 : u32 left_nritems;
3343 7737746 : int ret;
3344 :
3345 7737746 : if (!path->nodes[1])
3346 : return 1;
3347 :
3348 4433998 : slot = path->slots[1];
3349 4433998 : upper = path->nodes[1];
3350 4433998 : if (slot >= btrfs_header_nritems(upper) - 1)
3351 : return 1;
3352 :
3353 3538319 : btrfs_assert_tree_write_locked(path->nodes[1]);
3354 :
3355 3538319 : right = btrfs_read_node_slot(upper, slot + 1);
3356 3538411 : if (IS_ERR(right))
3357 0 : return PTR_ERR(right);
3358 :
3359 3538411 : __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3360 :
3361 3538389 : free_space = btrfs_leaf_free_space(right);
3362 3538422 : if (free_space < data_size)
3363 730484 : goto out_unlock;
3364 :
3365 2807938 : ret = btrfs_cow_block(trans, root, right, upper,
3366 : slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3367 2807915 : if (ret)
3368 0 : goto out_unlock;
3369 :
3370 2807915 : left_nritems = btrfs_header_nritems(left);
3371 2807915 : if (left_nritems == 0)
3372 0 : goto out_unlock;
3373 :
3374 2807915 : if (check_sibling_keys(left, right)) {
3375 0 : ret = -EUCLEAN;
3376 0 : btrfs_abort_transaction(trans, ret);
3377 0 : btrfs_tree_unlock(right);
3378 0 : free_extent_buffer(right);
3379 0 : return ret;
3380 : }
3381 2807929 : if (path->slots[0] == left_nritems && !empty) {
3382 : /* Key greater than all keys in the leaf, right neighbor has
3383 : * enough room for it and we're not emptying our leaf to delete
3384 : * it, therefore use right neighbor to insert the new item and
3385 : * no need to touch/dirty our left leaf. */
3386 21792 : btrfs_tree_unlock(left);
3387 21792 : free_extent_buffer(left);
3388 21792 : path->nodes[0] = right;
3389 21792 : path->slots[0] = 0;
3390 21792 : path->slots[1]++;
3391 21792 : return 0;
3392 : }
3393 :
3394 2786137 : return __push_leaf_right(trans, path, min_data_size, empty, right,
3395 : free_space, left_nritems, min_slot);
3396 730484 : out_unlock:
3397 730484 : btrfs_tree_unlock(right);
3398 730488 : free_extent_buffer(right);
3399 730488 : return 1;
3400 : }
3401 :
3402 : /*
3403 : * push some data in the path leaf to the left, trying to free up at
3404 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3405 : *
3406 : * max_slot can put a limit on how far into the leaf we'll push items. The
3407 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3408 : * items
3409 : */
3410 1215573 : static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3411 : struct btrfs_path *path, int data_size,
3412 : int empty, struct extent_buffer *left,
3413 : int free_space, u32 right_nritems,
3414 : u32 max_slot)
3415 : {
3416 1215573 : struct btrfs_fs_info *fs_info = left->fs_info;
3417 1215573 : struct btrfs_disk_key disk_key;
3418 1215573 : struct extent_buffer *right = path->nodes[0];
3419 1215573 : int i;
3420 1215573 : int push_space = 0;
3421 1215573 : int push_items = 0;
3422 1215573 : u32 old_left_nritems;
3423 1215573 : u32 nr;
3424 1215573 : int ret = 0;
3425 1215573 : u32 this_item_size;
3426 1215573 : u32 old_left_item_size;
3427 1215573 : struct btrfs_map_token token;
3428 :
3429 1215573 : if (empty)
3430 139601 : nr = min(right_nritems, max_slot);
3431 : else
3432 1075972 : nr = min(right_nritems - 1, max_slot);
3433 :
3434 41226771 : for (i = 0; i < nr; i++) {
3435 41143773 : if (!empty && push_items > 0) {
3436 36150302 : if (path->slots[0] < i)
3437 : break;
3438 36052586 : if (path->slots[0] == i) {
3439 165528 : int space = btrfs_leaf_free_space(right);
3440 :
3441 165528 : if (space + push_space * 2 > free_space)
3442 : break;
3443 : }
3444 : }
3445 :
3446 40980249 : if (path->slots[0] == i)
3447 227437 : push_space += data_size;
3448 :
3449 40980249 : this_item_size = btrfs_item_size(right, i);
3450 40980271 : if (this_item_size + sizeof(struct btrfs_item) + push_space >
3451 : free_space)
3452 : break;
3453 :
3454 40011198 : push_items++;
3455 40011198 : push_space += this_item_size + sizeof(struct btrfs_item);
3456 : }
3457 :
3458 1215595 : if (push_items == 0) {
3459 183254 : ret = 1;
3460 183254 : goto out;
3461 : }
3462 1925080 : WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3463 :
3464 : /* push data from right to left */
3465 1032341 : copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3466 :
3467 1032336 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3468 1032336 : btrfs_item_offset(right, push_items - 1);
3469 :
3470 1032335 : copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3471 : btrfs_item_offset(right, push_items - 1), push_space);
3472 1032345 : old_left_nritems = btrfs_header_nritems(left);
3473 1032345 : BUG_ON(old_left_nritems <= 0);
3474 :
3475 1032345 : btrfs_init_map_token(&token, left);
3476 1032343 : old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3477 41040502 : for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3478 40008159 : u32 ioff;
3479 :
3480 40008159 : ioff = btrfs_token_item_offset(&token, i);
3481 40008261 : btrfs_set_token_item_offset(&token, i,
3482 40008261 : ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3483 : }
3484 1032343 : btrfs_set_header_nritems(left, old_left_nritems + push_items);
3485 :
3486 : /* fixup right node */
3487 1032343 : if (push_items > right_nritems)
3488 0 : WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3489 : right_nritems);
3490 :
3491 1032343 : if (push_items < right_nritems) {
3492 949737 : push_space = btrfs_item_offset(right, push_items - 1) -
3493 949734 : leaf_data_end(right);
3494 949732 : memmove_leaf_data(right,
3495 949732 : BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3496 949730 : leaf_data_end(right), push_space);
3497 :
3498 949736 : memmove_leaf_items(right, 0, push_items,
3499 949736 : btrfs_header_nritems(right) - push_items);
3500 : }
3501 :
3502 1032341 : btrfs_init_map_token(&token, right);
3503 1032337 : right_nritems -= push_items;
3504 1032337 : btrfs_set_header_nritems(right, right_nritems);
3505 1032337 : push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3506 169067818 : for (i = 0; i < right_nritems; i++) {
3507 168035474 : push_space = push_space - btrfs_token_item_size(&token, i);
3508 168035826 : btrfs_set_token_item_offset(&token, i, push_space);
3509 : }
3510 :
3511 1032344 : btrfs_mark_buffer_dirty(left);
3512 1032344 : if (right_nritems)
3513 949737 : btrfs_mark_buffer_dirty(right);
3514 : else
3515 82607 : btrfs_clear_buffer_dirty(trans, right);
3516 :
3517 1032346 : btrfs_item_key(right, &disk_key, 0);
3518 1032346 : fixup_low_keys(path, &disk_key, 1);
3519 :
3520 : /* then fixup the leaf pointer in the path */
3521 1032343 : if (path->slots[0] < push_items) {
3522 218454 : path->slots[0] += old_left_nritems;
3523 218454 : btrfs_tree_unlock(path->nodes[0]);
3524 218454 : free_extent_buffer(path->nodes[0]);
3525 218454 : path->nodes[0] = left;
3526 218454 : path->slots[1] -= 1;
3527 : } else {
3528 813889 : btrfs_tree_unlock(left);
3529 813884 : free_extent_buffer(left);
3530 813890 : path->slots[0] -= push_items;
3531 : }
3532 1032344 : BUG_ON(path->slots[0] < 0);
3533 : return ret;
3534 : out:
3535 183254 : btrfs_tree_unlock(left);
3536 183252 : free_extent_buffer(left);
3537 183252 : return ret;
3538 : }
3539 :
3540 : /*
3541 : * push some data in the path leaf to the left, trying to free up at
3542 : * least data_size bytes. returns zero if the push worked, nonzero otherwise
3543 : *
3544 : * max_slot can put a limit on how far into the leaf we'll push items. The
3545 : * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3546 : * items
3547 : */
3548 5424570 : static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3549 : *root, struct btrfs_path *path, int min_data_size,
3550 : int data_size, int empty, u32 max_slot)
3551 : {
3552 5424570 : struct extent_buffer *right = path->nodes[0];
3553 5424570 : struct extent_buffer *left;
3554 5424570 : int slot;
3555 5424570 : int free_space;
3556 5424570 : u32 right_nritems;
3557 5424570 : int ret = 0;
3558 :
3559 5424570 : slot = path->slots[1];
3560 5424570 : if (slot == 0)
3561 : return 1;
3562 1995556 : if (!path->nodes[1])
3563 : return 1;
3564 :
3565 1995556 : right_nritems = btrfs_header_nritems(right);
3566 1995556 : if (right_nritems == 0)
3567 : return 1;
3568 :
3569 1995564 : btrfs_assert_tree_write_locked(path->nodes[1]);
3570 :
3571 1995564 : left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3572 1995576 : if (IS_ERR(left))
3573 0 : return PTR_ERR(left);
3574 :
3575 1995576 : __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3576 :
3577 1995586 : free_space = btrfs_leaf_free_space(left);
3578 1995591 : if (free_space < data_size) {
3579 778702 : ret = 1;
3580 778702 : goto out;
3581 : }
3582 :
3583 1216889 : ret = btrfs_cow_block(trans, root, left,
3584 : path->nodes[1], slot - 1, &left,
3585 : BTRFS_NESTING_LEFT_COW);
3586 1216883 : if (ret) {
3587 : /* we hit -ENOSPC, but it isn't fatal here */
3588 1295 : if (ret == -ENOSPC)
3589 1295 : ret = 1;
3590 1295 : goto out;
3591 : }
3592 :
3593 1215588 : if (check_sibling_keys(left, right)) {
3594 0 : ret = -EUCLEAN;
3595 0 : btrfs_abort_transaction(trans, ret);
3596 0 : goto out;
3597 : }
3598 1215582 : return __push_leaf_left(trans, path, min_data_size, empty, left,
3599 : free_space, right_nritems, max_slot);
3600 779997 : out:
3601 779997 : btrfs_tree_unlock(left);
3602 779997 : free_extent_buffer(left);
3603 779997 : return ret;
3604 : }
3605 :
3606 : /*
3607 : * split the path's leaf in two, making sure there is at least data_size
3608 : * available for the resulting leaf level of the path.
3609 : */
3610 368333 : static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3611 : struct btrfs_path *path,
3612 : struct extent_buffer *l,
3613 : struct extent_buffer *right,
3614 : int slot, int mid, int nritems)
3615 : {
3616 368333 : struct btrfs_fs_info *fs_info = trans->fs_info;
3617 368333 : int data_copy_size;
3618 368333 : int rt_data_off;
3619 368333 : int i;
3620 368333 : int ret;
3621 368333 : struct btrfs_disk_key disk_key;
3622 368333 : struct btrfs_map_token token;
3623 :
3624 368333 : nritems = nritems - mid;
3625 368333 : btrfs_set_header_nritems(right, nritems);
3626 368333 : data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3627 :
3628 368333 : copy_leaf_items(right, l, 0, mid, nritems);
3629 :
3630 368333 : copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3631 368333 : leaf_data_end(l), data_copy_size);
3632 :
3633 368333 : rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3634 :
3635 368331 : btrfs_init_map_token(&token, right);
3636 30778302 : for (i = 0; i < nritems; i++) {
3637 30041640 : u32 ioff;
3638 :
3639 30041640 : ioff = btrfs_token_item_offset(&token, i);
3640 30041637 : btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3641 : }
3642 :
3643 368331 : btrfs_set_header_nritems(l, mid);
3644 368331 : btrfs_item_key(right, &disk_key, 0);
3645 368331 : ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3646 368327 : if (ret < 0)
3647 : return ret;
3648 :
3649 368327 : btrfs_mark_buffer_dirty(right);
3650 368332 : btrfs_mark_buffer_dirty(l);
3651 368333 : BUG_ON(path->slots[0] != slot);
3652 :
3653 368333 : if (mid <= slot) {
3654 256886 : btrfs_tree_unlock(path->nodes[0]);
3655 256886 : free_extent_buffer(path->nodes[0]);
3656 256886 : path->nodes[0] = right;
3657 256886 : path->slots[0] -= mid;
3658 256886 : path->slots[1] += 1;
3659 : } else {
3660 111447 : btrfs_tree_unlock(right);
3661 111447 : free_extent_buffer(right);
3662 : }
3663 :
3664 368333 : BUG_ON(path->slots[0] < 0);
3665 :
3666 : return 0;
3667 : }
3668 :
3669 : /*
3670 : * double splits happen when we need to insert a big item in the middle
3671 : * of a leaf. A double split can leave us with 3 mostly empty leaves:
3672 : * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3673 : * A B C
3674 : *
3675 : * We avoid this by trying to push the items on either side of our target
3676 : * into the adjacent leaves. If all goes well we can avoid the double split
3677 : * completely.
3678 : */
3679 402 : static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3680 : struct btrfs_root *root,
3681 : struct btrfs_path *path,
3682 : int data_size)
3683 : {
3684 402 : int ret;
3685 402 : int progress = 0;
3686 402 : int slot;
3687 402 : u32 nritems;
3688 402 : int space_needed = data_size;
3689 :
3690 402 : slot = path->slots[0];
3691 402 : if (slot < btrfs_header_nritems(path->nodes[0]))
3692 402 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3693 :
3694 : /*
3695 : * try to push all the items after our slot into the
3696 : * right leaf
3697 : */
3698 402 : ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3699 402 : if (ret < 0)
3700 : return ret;
3701 :
3702 402 : if (ret == 0)
3703 0 : progress++;
3704 :
3705 402 : nritems = btrfs_header_nritems(path->nodes[0]);
3706 : /*
3707 : * our goal is to get our slot at the start or end of a leaf. If
3708 : * we've done so we're done
3709 : */
3710 402 : if (path->slots[0] == 0 || path->slots[0] == nritems)
3711 : return 0;
3712 :
3713 402 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3714 : return 0;
3715 :
3716 : /* try to push all the items before our slot into the next leaf */
3717 402 : slot = path->slots[0];
3718 402 : space_needed = data_size;
3719 402 : if (slot > 0)
3720 402 : space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3721 402 : ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3722 402 : if (ret < 0)
3723 : return ret;
3724 :
3725 402 : if (ret == 0)
3726 0 : progress++;
3727 :
3728 402 : if (progress)
3729 0 : return 0;
3730 : return 1;
3731 : }
3732 :
3733 : /*
3734 : * split the path's leaf in two, making sure there is at least data_size
3735 : * available for the resulting leaf level of the path.
3736 : *
3737 : * returns 0 if all went well and < 0 on failure.
3738 : */
3739 3678081 : static noinline int split_leaf(struct btrfs_trans_handle *trans,
3740 : struct btrfs_root *root,
3741 : const struct btrfs_key *ins_key,
3742 : struct btrfs_path *path, int data_size,
3743 : int extend)
3744 : {
3745 3678081 : struct btrfs_disk_key disk_key;
3746 3678081 : struct extent_buffer *l;
3747 3678081 : u32 nritems;
3748 3678081 : int mid;
3749 3678081 : int slot;
3750 3678081 : struct extent_buffer *right;
3751 3678081 : struct btrfs_fs_info *fs_info = root->fs_info;
3752 3678081 : int ret = 0;
3753 3678081 : int wret;
3754 3678081 : int split;
3755 3678081 : int num_doubles = 0;
3756 3678081 : int tried_avoid_double = 0;
3757 :
3758 3678081 : l = path->nodes[0];
3759 3678081 : slot = path->slots[0];
3760 4369589 : if (extend && data_size + btrfs_item_size(l, slot) +
3761 691508 : sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3762 : return -EOVERFLOW;
3763 :
3764 : /* first try to make some room by pushing left and right */
3765 3670041 : if (data_size && path->nodes[1]) {
3766 3668072 : int space_needed = data_size;
3767 :
3768 3668072 : if (slot < btrfs_header_nritems(l))
3769 3366532 : space_needed -= btrfs_leaf_free_space(l);
3770 :
3771 3668165 : wret = push_leaf_right(trans, root, path, space_needed,
3772 : space_needed, 0, 0);
3773 3668142 : if (wret < 0)
3774 : return wret;
3775 3668142 : if (wret) {
3776 1229798 : space_needed = data_size;
3777 1229798 : if (slot > 0)
3778 1224926 : space_needed -= btrfs_leaf_free_space(l);
3779 1229798 : wret = push_leaf_left(trans, root, path, space_needed,
3780 : space_needed, 0, (u32)-1);
3781 1229811 : if (wret < 0)
3782 : return wret;
3783 : }
3784 3668155 : l = path->nodes[0];
3785 :
3786 : /* did the pushes work? */
3787 3668155 : if (btrfs_leaf_free_space(l) >= data_size)
3788 : return 0;
3789 : }
3790 :
3791 371933 : if (!path->nodes[1]) {
3792 1944 : ret = insert_new_root(trans, root, path, 1);
3793 1944 : if (ret)
3794 : return ret;
3795 : }
3796 371933 : again:
3797 372747 : split = 1;
3798 372747 : l = path->nodes[0];
3799 372747 : slot = path->slots[0];
3800 372747 : nritems = btrfs_header_nritems(l);
3801 372747 : mid = (nritems + 1) / 2;
3802 :
3803 372747 : if (mid <= slot) {
3804 521119 : if (nritems == 1 ||
3805 260497 : leaf_space_used(l, mid, nritems - mid) + data_size >
3806 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3807 9120 : if (slot >= nritems) {
3808 : split = 0;
3809 : } else {
3810 5438 : mid = slot;
3811 5438 : if (mid != nritems &&
3812 5438 : leaf_space_used(l, mid, nritems - mid) +
3813 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3814 566 : if (data_size && !tried_avoid_double)
3815 283 : goto push_for_double;
3816 : split = 2;
3817 : }
3818 : }
3819 : }
3820 : } else {
3821 112126 : if (leaf_space_used(l, 0, mid) + data_size >
3822 : BTRFS_LEAF_DATA_SIZE(fs_info)) {
3823 685 : if (!extend && data_size && slot == 0) {
3824 : split = 0;
3825 346 : } else if ((extend || !data_size) && slot == 0) {
3826 : mid = 1;
3827 : } else {
3828 346 : mid = slot;
3829 692 : if (mid != nritems &&
3830 346 : leaf_space_used(l, mid, nritems - mid) +
3831 : data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3832 238 : if (data_size && !tried_avoid_double)
3833 119 : goto push_for_double;
3834 : split = 2;
3835 : }
3836 : }
3837 : }
3838 : }
3839 :
3840 : if (split == 0)
3841 4021 : btrfs_cpu_key_to_disk(&disk_key, ins_key);
3842 : else
3843 368321 : btrfs_item_key(l, &disk_key, mid);
3844 :
3845 : /*
3846 : * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3847 : * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3848 : * subclasses, which is 8 at the time of this patch, and we've maxed it
3849 : * out. In the future we could add a
3850 : * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3851 : * use BTRFS_NESTING_NEW_ROOT.
3852 : */
3853 744279 : right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3854 : &disk_key, 0, l->start, 0,
3855 : num_doubles ? BTRFS_NESTING_NEW_ROOT :
3856 : BTRFS_NESTING_SPLIT);
3857 372354 : if (IS_ERR(right))
3858 0 : return PTR_ERR(right);
3859 :
3860 372354 : root_add_used(root, fs_info->nodesize);
3861 :
3862 372354 : if (split == 0) {
3863 4021 : if (mid <= slot) {
3864 3682 : btrfs_set_header_nritems(right, 0);
3865 3682 : ret = insert_ptr(trans, path, &disk_key,
3866 3682 : right->start, path->slots[1] + 1, 1);
3867 3682 : if (ret < 0) {
3868 0 : btrfs_tree_unlock(right);
3869 0 : free_extent_buffer(right);
3870 0 : return ret;
3871 : }
3872 3682 : btrfs_tree_unlock(path->nodes[0]);
3873 3682 : free_extent_buffer(path->nodes[0]);
3874 3682 : path->nodes[0] = right;
3875 3682 : path->slots[0] = 0;
3876 3682 : path->slots[1] += 1;
3877 : } else {
3878 339 : btrfs_set_header_nritems(right, 0);
3879 339 : ret = insert_ptr(trans, path, &disk_key,
3880 : right->start, path->slots[1], 1);
3881 339 : if (ret < 0) {
3882 0 : btrfs_tree_unlock(right);
3883 0 : free_extent_buffer(right);
3884 0 : return ret;
3885 : }
3886 339 : btrfs_tree_unlock(path->nodes[0]);
3887 339 : free_extent_buffer(path->nodes[0]);
3888 339 : path->nodes[0] = right;
3889 339 : path->slots[0] = 0;
3890 339 : if (path->slots[1] == 0)
3891 51 : fixup_low_keys(path, &disk_key, 1);
3892 : }
3893 : /*
3894 : * We create a new leaf 'right' for the required ins_len and
3895 : * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3896 : * the content of ins_len to 'right'.
3897 : */
3898 4021 : return ret;
3899 : }
3900 :
3901 368333 : ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3902 368342 : if (ret < 0) {
3903 0 : btrfs_tree_unlock(right);
3904 0 : free_extent_buffer(right);
3905 0 : return ret;
3906 : }
3907 :
3908 368342 : if (split == 2) {
3909 412 : BUG_ON(num_doubles != 0);
3910 412 : num_doubles++;
3911 412 : goto again;
3912 : }
3913 :
3914 : return 0;
3915 :
3916 402 : push_for_double:
3917 402 : push_for_double_split(trans, root, path, data_size);
3918 402 : tried_avoid_double = 1;
3919 402 : if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3920 : return 0;
3921 402 : goto again;
3922 : }
3923 :
3924 2815250 : static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3925 : struct btrfs_root *root,
3926 : struct btrfs_path *path, int ins_len)
3927 : {
3928 2815250 : struct btrfs_key key;
3929 2815250 : struct extent_buffer *leaf;
3930 2815250 : struct btrfs_file_extent_item *fi;
3931 2815250 : u64 extent_len = 0;
3932 2815250 : u32 item_size;
3933 2815250 : int ret;
3934 :
3935 2815250 : leaf = path->nodes[0];
3936 2815250 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3937 :
3938 2815251 : BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3939 : key.type != BTRFS_EXTENT_CSUM_KEY);
3940 :
3941 2815251 : if (btrfs_leaf_free_space(leaf) >= ins_len)
3942 : return 0;
3943 :
3944 42303 : item_size = btrfs_item_size(leaf, path->slots[0]);
3945 42303 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3946 39917 : fi = btrfs_item_ptr(leaf, path->slots[0],
3947 : struct btrfs_file_extent_item);
3948 39917 : extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3949 : }
3950 42303 : btrfs_release_path(path);
3951 :
3952 42303 : path->keep_locks = 1;
3953 42303 : path->search_for_split = 1;
3954 42303 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3955 42303 : path->search_for_split = 0;
3956 42303 : if (ret > 0)
3957 : ret = -EAGAIN;
3958 42302 : if (ret < 0)
3959 1 : goto err;
3960 :
3961 42302 : ret = -EAGAIN;
3962 42302 : leaf = path->nodes[0];
3963 : /* if our item isn't there, return now */
3964 42302 : if (item_size != btrfs_item_size(leaf, path->slots[0]))
3965 0 : goto err;
3966 :
3967 : /* the leaf has changed, it now has room. return now */
3968 42302 : if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3969 54 : goto err;
3970 :
3971 42248 : if (key.type == BTRFS_EXTENT_DATA_KEY) {
3972 39865 : fi = btrfs_item_ptr(leaf, path->slots[0],
3973 : struct btrfs_file_extent_item);
3974 39865 : if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3975 0 : goto err;
3976 : }
3977 :
3978 42248 : ret = split_leaf(trans, root, &key, path, ins_len, 1);
3979 42248 : if (ret)
3980 0 : goto err;
3981 :
3982 42248 : path->keep_locks = 0;
3983 42248 : btrfs_unlock_up_safe(path, 1);
3984 42248 : return 0;
3985 55 : err:
3986 55 : path->keep_locks = 0;
3987 55 : return ret;
3988 : }
3989 :
3990 203233 : static noinline int split_item(struct btrfs_path *path,
3991 : const struct btrfs_key *new_key,
3992 : unsigned long split_offset)
3993 : {
3994 203233 : struct extent_buffer *leaf;
3995 203233 : int orig_slot, slot;
3996 203233 : char *buf;
3997 203233 : u32 nritems;
3998 203233 : u32 item_size;
3999 203233 : u32 orig_offset;
4000 203233 : struct btrfs_disk_key disk_key;
4001 :
4002 203233 : leaf = path->nodes[0];
4003 : /*
4004 : * Shouldn't happen because the caller must have previously called
4005 : * setup_leaf_for_split() to make room for the new item in the leaf.
4006 : */
4007 203233 : if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
4008 : return -ENOSPC;
4009 :
4010 203233 : orig_slot = path->slots[0];
4011 203233 : orig_offset = btrfs_item_offset(leaf, path->slots[0]);
4012 203233 : item_size = btrfs_item_size(leaf, path->slots[0]);
4013 :
4014 203233 : buf = kmalloc(item_size, GFP_NOFS);
4015 203233 : if (!buf)
4016 : return -ENOMEM;
4017 :
4018 203233 : read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4019 : path->slots[0]), item_size);
4020 :
4021 203233 : slot = path->slots[0] + 1;
4022 203233 : nritems = btrfs_header_nritems(leaf);
4023 203233 : if (slot != nritems) {
4024 : /* shift the items */
4025 196822 : memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
4026 : }
4027 :
4028 203233 : btrfs_cpu_key_to_disk(&disk_key, new_key);
4029 203233 : btrfs_set_item_key(leaf, &disk_key, slot);
4030 :
4031 203233 : btrfs_set_item_offset(leaf, slot, orig_offset);
4032 203233 : btrfs_set_item_size(leaf, slot, item_size - split_offset);
4033 :
4034 203233 : btrfs_set_item_offset(leaf, orig_slot,
4035 203233 : orig_offset + item_size - split_offset);
4036 203233 : btrfs_set_item_size(leaf, orig_slot, split_offset);
4037 :
4038 203233 : btrfs_set_header_nritems(leaf, nritems + 1);
4039 :
4040 : /* write the data for the start of the original item */
4041 406466 : write_extent_buffer(leaf, buf,
4042 203233 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4043 : split_offset);
4044 :
4045 : /* write the data for the new item */
4046 203233 : write_extent_buffer(leaf, buf + split_offset,
4047 203233 : btrfs_item_ptr_offset(leaf, slot),
4048 : item_size - split_offset);
4049 203233 : btrfs_mark_buffer_dirty(leaf);
4050 :
4051 203233 : BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4052 203233 : kfree(buf);
4053 203233 : return 0;
4054 : }
4055 :
4056 : /*
4057 : * This function splits a single item into two items,
4058 : * giving 'new_key' to the new item and splitting the
4059 : * old one at split_offset (from the start of the item).
4060 : *
4061 : * The path may be released by this operation. After
4062 : * the split, the path is pointing to the old item. The
4063 : * new item is going to be in the same node as the old one.
4064 : *
4065 : * Note, the item being split must be smaller enough to live alone on
4066 : * a tree block with room for one extra struct btrfs_item
4067 : *
4068 : * This allows us to split the item in place, keeping a lock on the
4069 : * leaf the entire time.
4070 : */
4071 203236 : int btrfs_split_item(struct btrfs_trans_handle *trans,
4072 : struct btrfs_root *root,
4073 : struct btrfs_path *path,
4074 : const struct btrfs_key *new_key,
4075 : unsigned long split_offset)
4076 : {
4077 203236 : int ret;
4078 203236 : ret = setup_leaf_for_split(trans, root, path,
4079 : sizeof(struct btrfs_item));
4080 203236 : if (ret)
4081 : return ret;
4082 :
4083 203233 : ret = split_item(path, new_key, split_offset);
4084 203233 : return ret;
4085 : }
4086 :
4087 : /*
4088 : * make the item pointed to by the path smaller. new_size indicates
4089 : * how small to make it, and from_end tells us if we just chop bytes
4090 : * off the end of the item or if we shift the item to chop bytes off
4091 : * the front.
4092 : */
4093 18174370 : void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4094 : {
4095 18174370 : int slot;
4096 18174370 : struct extent_buffer *leaf;
4097 18174370 : u32 nritems;
4098 18174370 : unsigned int data_end;
4099 18174370 : unsigned int old_data_start;
4100 18174370 : unsigned int old_size;
4101 18174370 : unsigned int size_diff;
4102 18174370 : int i;
4103 18174370 : struct btrfs_map_token token;
4104 :
4105 18174370 : leaf = path->nodes[0];
4106 18174370 : slot = path->slots[0];
4107 :
4108 18174370 : old_size = btrfs_item_size(leaf, slot);
4109 18174357 : if (old_size == new_size)
4110 0 : return;
4111 :
4112 18174357 : nritems = btrfs_header_nritems(leaf);
4113 18174357 : data_end = leaf_data_end(leaf);
4114 :
4115 18174358 : old_data_start = btrfs_item_offset(leaf, slot);
4116 :
4117 18174361 : size_diff = old_size - new_size;
4118 :
4119 18174361 : BUG_ON(slot < 0);
4120 18174361 : BUG_ON(slot >= nritems);
4121 :
4122 : /*
4123 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4124 : */
4125 : /* first correct the data pointers */
4126 18174361 : btrfs_init_map_token(&token, leaf);
4127 1559703858 : for (i = slot; i < nritems; i++) {
4128 1523355110 : u32 ioff;
4129 :
4130 1523355110 : ioff = btrfs_token_item_offset(&token, i);
4131 1523355141 : btrfs_set_token_item_offset(&token, i, ioff + size_diff);
4132 : }
4133 :
4134 : /* shift the data */
4135 18174387 : if (from_end) {
4136 16124292 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4137 16124292 : old_data_start + new_size - data_end);
4138 : } else {
4139 2050095 : struct btrfs_disk_key disk_key;
4140 2050095 : u64 offset;
4141 :
4142 2050095 : btrfs_item_key(leaf, &disk_key, slot);
4143 :
4144 2050095 : if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4145 0 : unsigned long ptr;
4146 0 : struct btrfs_file_extent_item *fi;
4147 :
4148 0 : fi = btrfs_item_ptr(leaf, slot,
4149 : struct btrfs_file_extent_item);
4150 0 : fi = (struct btrfs_file_extent_item *)(
4151 0 : (unsigned long)fi - size_diff);
4152 :
4153 0 : if (btrfs_file_extent_type(leaf, fi) ==
4154 : BTRFS_FILE_EXTENT_INLINE) {
4155 0 : ptr = btrfs_item_ptr_offset(leaf, slot);
4156 0 : memmove_extent_buffer(leaf, ptr,
4157 : (unsigned long)fi,
4158 : BTRFS_FILE_EXTENT_INLINE_DATA_START);
4159 : }
4160 : }
4161 :
4162 2050095 : memmove_leaf_data(leaf, data_end + size_diff, data_end,
4163 2050095 : old_data_start - data_end);
4164 :
4165 2050095 : offset = btrfs_disk_key_offset(&disk_key);
4166 2050095 : btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4167 2050095 : btrfs_set_item_key(leaf, &disk_key, slot);
4168 2050095 : if (slot == 0)
4169 552958 : fixup_low_keys(path, &disk_key, 1);
4170 : }
4171 :
4172 18174386 : btrfs_set_item_size(leaf, slot, new_size);
4173 18174375 : btrfs_mark_buffer_dirty(leaf);
4174 :
4175 18174384 : if (btrfs_leaf_free_space(leaf) < 0) {
4176 0 : btrfs_print_leaf(leaf);
4177 18174381 : BUG();
4178 : }
4179 : }
4180 :
4181 : /*
4182 : * make the item pointed to by the path bigger, data_size is the added size.
4183 : */
4184 20788647 : void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4185 : {
4186 20788647 : int slot;
4187 20788647 : struct extent_buffer *leaf;
4188 20788647 : u32 nritems;
4189 20788647 : unsigned int data_end;
4190 20788647 : unsigned int old_data;
4191 20788647 : unsigned int old_size;
4192 20788647 : int i;
4193 20788647 : struct btrfs_map_token token;
4194 :
4195 20788647 : leaf = path->nodes[0];
4196 :
4197 20788647 : nritems = btrfs_header_nritems(leaf);
4198 20788647 : data_end = leaf_data_end(leaf);
4199 :
4200 20788575 : if (btrfs_leaf_free_space(leaf) < data_size) {
4201 0 : btrfs_print_leaf(leaf);
4202 0 : BUG();
4203 : }
4204 20788613 : slot = path->slots[0];
4205 20788613 : old_data = btrfs_item_data_end(leaf, slot);
4206 :
4207 20788591 : BUG_ON(slot < 0);
4208 20788591 : if (slot >= nritems) {
4209 0 : btrfs_print_leaf(leaf);
4210 0 : btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4211 : slot, nritems);
4212 0 : BUG();
4213 : }
4214 :
4215 : /*
4216 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4217 : */
4218 : /* first correct the data pointers */
4219 20788591 : btrfs_init_map_token(&token, leaf);
4220 1602275426 : for (i = slot; i < nritems; i++) {
4221 1560698177 : u32 ioff;
4222 :
4223 1560698177 : ioff = btrfs_token_item_offset(&token, i);
4224 1560699269 : btrfs_set_token_item_offset(&token, i, ioff - data_size);
4225 : }
4226 :
4227 : /* shift the data */
4228 20788658 : memmove_leaf_data(leaf, data_end - data_size, data_end,
4229 20788658 : old_data - data_end);
4230 :
4231 20788683 : data_end = old_data;
4232 20788683 : old_size = btrfs_item_size(leaf, slot);
4233 20788655 : btrfs_set_item_size(leaf, slot, old_size + data_size);
4234 20788628 : btrfs_mark_buffer_dirty(leaf);
4235 :
4236 20788681 : if (btrfs_leaf_free_space(leaf) < 0) {
4237 0 : btrfs_print_leaf(leaf);
4238 0 : BUG();
4239 : }
4240 20788663 : }
4241 :
4242 : /*
4243 : * Make space in the node before inserting one or more items.
4244 : *
4245 : * @root: root we are inserting items to
4246 : * @path: points to the leaf/slot where we are going to insert new items
4247 : * @batch: information about the batch of items to insert
4248 : *
4249 : * Main purpose is to save stack depth by doing the bulk of the work in a
4250 : * function that doesn't call btrfs_search_slot
4251 : */
4252 68351620 : static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4253 : const struct btrfs_item_batch *batch)
4254 : {
4255 68351620 : struct btrfs_fs_info *fs_info = root->fs_info;
4256 68351620 : int i;
4257 68351620 : u32 nritems;
4258 68351620 : unsigned int data_end;
4259 68351620 : struct btrfs_disk_key disk_key;
4260 68351620 : struct extent_buffer *leaf;
4261 68351620 : int slot;
4262 68351620 : struct btrfs_map_token token;
4263 68351620 : u32 total_size;
4264 :
4265 : /*
4266 : * Before anything else, update keys in the parent and other ancestors
4267 : * if needed, then release the write locks on them, so that other tasks
4268 : * can use them while we modify the leaf.
4269 : */
4270 68351620 : if (path->slots[0] == 0) {
4271 375940 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4272 375941 : fixup_low_keys(path, &disk_key, 1);
4273 : }
4274 68351623 : btrfs_unlock_up_safe(path, 1);
4275 :
4276 68360419 : leaf = path->nodes[0];
4277 68360419 : slot = path->slots[0];
4278 :
4279 68360419 : nritems = btrfs_header_nritems(leaf);
4280 68360419 : data_end = leaf_data_end(leaf);
4281 68359804 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4282 :
4283 68359804 : if (btrfs_leaf_free_space(leaf) < total_size) {
4284 0 : btrfs_print_leaf(leaf);
4285 0 : btrfs_crit(fs_info, "not enough freespace need %u have %d",
4286 : total_size, btrfs_leaf_free_space(leaf));
4287 0 : BUG();
4288 : }
4289 :
4290 68351563 : btrfs_init_map_token(&token, leaf);
4291 68346811 : if (slot != nritems) {
4292 41926462 : unsigned int old_data = btrfs_item_data_end(leaf, slot);
4293 :
4294 41925945 : if (old_data < data_end) {
4295 0 : btrfs_print_leaf(leaf);
4296 0 : btrfs_crit(fs_info,
4297 : "item at slot %d with data offset %u beyond data end of leaf %u",
4298 : slot, old_data, data_end);
4299 0 : BUG();
4300 : }
4301 : /*
4302 : * item0..itemN ... dataN.offset..dataN.size .. data0.size
4303 : */
4304 : /* first correct the data pointers */
4305 3754197850 : for (i = slot; i < nritems; i++) {
4306 3712271237 : u32 ioff;
4307 :
4308 3712271237 : ioff = btrfs_token_item_offset(&token, i);
4309 3712371106 : btrfs_set_token_item_offset(&token, i,
4310 3712371106 : ioff - batch->total_data_size);
4311 : }
4312 : /* shift the items */
4313 41926613 : memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4314 :
4315 : /* shift the data */
4316 41926311 : memmove_leaf_data(leaf, data_end - batch->total_data_size,
4317 41926311 : data_end, old_data - data_end);
4318 41926311 : data_end = old_data;
4319 : }
4320 :
4321 : /* setup the item for the new data */
4322 144944594 : for (i = 0; i < batch->nr; i++) {
4323 76597554 : btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4324 76581574 : btrfs_set_item_key(leaf, &disk_key, slot + i);
4325 76584070 : data_end -= batch->data_sizes[i];
4326 76584070 : btrfs_set_token_item_offset(&token, slot + i, data_end);
4327 76578575 : btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4328 : }
4329 :
4330 68347040 : btrfs_set_header_nritems(leaf, nritems + batch->nr);
4331 68347040 : btrfs_mark_buffer_dirty(leaf);
4332 :
4333 68363147 : if (btrfs_leaf_free_space(leaf) < 0) {
4334 0 : btrfs_print_leaf(leaf);
4335 0 : BUG();
4336 : }
4337 68353148 : }
4338 :
4339 : /*
4340 : * Insert a new item into a leaf.
4341 : *
4342 : * @root: The root of the btree.
4343 : * @path: A path pointing to the target leaf and slot.
4344 : * @key: The key of the new item.
4345 : * @data_size: The size of the data associated with the new key.
4346 : */
4347 4045438 : void btrfs_setup_item_for_insert(struct btrfs_root *root,
4348 : struct btrfs_path *path,
4349 : const struct btrfs_key *key,
4350 : u32 data_size)
4351 : {
4352 6657400 : struct btrfs_item_batch batch;
4353 :
4354 6657400 : batch.keys = key;
4355 6657400 : batch.data_sizes = &data_size;
4356 6657400 : batch.total_data_size = data_size;
4357 6657400 : batch.nr = 1;
4358 :
4359 4045438 : setup_items_for_insert(root, path, &batch);
4360 4045436 : }
4361 :
4362 : /*
4363 : * Given a key and some data, insert items into the tree.
4364 : * This does all the path init required, making room in the tree if needed.
4365 : */
4366 62102077 : int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4367 : struct btrfs_root *root,
4368 : struct btrfs_path *path,
4369 : const struct btrfs_item_batch *batch)
4370 : {
4371 62102077 : int ret = 0;
4372 62102077 : int slot;
4373 62102077 : u32 total_size;
4374 :
4375 62102077 : total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4376 62102077 : ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4377 62101105 : if (ret == 0)
4378 : return -EEXIST;
4379 61703441 : if (ret < 0)
4380 : return ret;
4381 :
4382 61695395 : slot = path->slots[0];
4383 61695395 : BUG_ON(slot < 0);
4384 :
4385 61695395 : setup_items_for_insert(root, path, batch);
4386 61695395 : return 0;
4387 : }
4388 :
4389 : /*
4390 : * Given a key and some data, insert an item into the tree.
4391 : * This does all the path init required, making room in the tree if needed.
4392 : */
4393 9943 : int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4394 : const struct btrfs_key *cpu_key, void *data,
4395 : u32 data_size)
4396 : {
4397 9943 : int ret = 0;
4398 9943 : struct btrfs_path *path;
4399 9943 : struct extent_buffer *leaf;
4400 9943 : unsigned long ptr;
4401 :
4402 9943 : path = btrfs_alloc_path();
4403 9943 : if (!path)
4404 : return -ENOMEM;
4405 9943 : ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4406 9943 : if (!ret) {
4407 9943 : leaf = path->nodes[0];
4408 9943 : ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4409 9943 : write_extent_buffer(leaf, data, ptr, data_size);
4410 9943 : btrfs_mark_buffer_dirty(leaf);
4411 : }
4412 9943 : btrfs_free_path(path);
4413 9943 : return ret;
4414 : }
4415 :
4416 : /*
4417 : * This function duplicates an item, giving 'new_key' to the new item.
4418 : * It guarantees both items live in the same tree leaf and the new item is
4419 : * contiguous with the original item.
4420 : *
4421 : * This allows us to split a file extent in place, keeping a lock on the leaf
4422 : * the entire time.
4423 : */
4424 2612014 : int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4425 : struct btrfs_root *root,
4426 : struct btrfs_path *path,
4427 : const struct btrfs_key *new_key)
4428 : {
4429 2612014 : struct extent_buffer *leaf;
4430 2612014 : int ret;
4431 2612014 : u32 item_size;
4432 :
4433 2612014 : leaf = path->nodes[0];
4434 2612014 : item_size = btrfs_item_size(leaf, path->slots[0]);
4435 2612014 : ret = setup_leaf_for_split(trans, root, path,
4436 2612014 : item_size + sizeof(struct btrfs_item));
4437 2612012 : if (ret)
4438 : return ret;
4439 :
4440 2611962 : path->slots[0]++;
4441 2611962 : btrfs_setup_item_for_insert(root, path, new_key, item_size);
4442 2611958 : leaf = path->nodes[0];
4443 7835871 : memcpy_extent_buffer(leaf,
4444 5223913 : btrfs_item_ptr_offset(leaf, path->slots[0]),
4445 2611958 : btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4446 : item_size);
4447 2611957 : return 0;
4448 : }
4449 :
4450 : /*
4451 : * delete the pointer from a given node.
4452 : *
4453 : * the tree should have been previously balanced so the deletion does not
4454 : * empty a node.
4455 : *
4456 : * This is exported for use inside btrfs-progs, don't un-export it.
4457 : */
4458 118321 : int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4459 : struct btrfs_path *path, int level, int slot)
4460 : {
4461 118321 : struct extent_buffer *parent = path->nodes[level];
4462 118321 : u32 nritems;
4463 118321 : int ret;
4464 :
4465 118321 : nritems = btrfs_header_nritems(parent);
4466 118321 : if (slot != nritems - 1) {
4467 114237 : if (level) {
4468 114237 : ret = btrfs_tree_mod_log_insert_move(parent, slot,
4469 114237 : slot + 1, nritems - slot - 1);
4470 114237 : if (ret < 0) {
4471 0 : btrfs_abort_transaction(trans, ret);
4472 0 : return ret;
4473 : }
4474 : }
4475 114237 : memmove_extent_buffer(parent,
4476 : btrfs_node_key_ptr_offset(parent, slot),
4477 : btrfs_node_key_ptr_offset(parent, slot + 1),
4478 : sizeof(struct btrfs_key_ptr) *
4479 114237 : (nritems - slot - 1));
4480 4084 : } else if (level) {
4481 4084 : ret = btrfs_tree_mod_log_insert_key(parent, slot,
4482 : BTRFS_MOD_LOG_KEY_REMOVE);
4483 4084 : if (ret < 0) {
4484 0 : btrfs_abort_transaction(trans, ret);
4485 0 : return ret;
4486 : }
4487 : }
4488 :
4489 118323 : nritems--;
4490 118323 : btrfs_set_header_nritems(parent, nritems);
4491 118323 : if (nritems == 0 && parent == root->node) {
4492 0 : BUG_ON(btrfs_header_level(root->node) != 1);
4493 : /* just turn the root into a leaf and break */
4494 0 : btrfs_set_header_level(root->node, 0);
4495 118323 : } else if (slot == 0) {
4496 8742 : struct btrfs_disk_key disk_key;
4497 :
4498 8742 : btrfs_node_key(parent, &disk_key, 0);
4499 8742 : fixup_low_keys(path, &disk_key, level + 1);
4500 : }
4501 118323 : btrfs_mark_buffer_dirty(parent);
4502 118323 : return 0;
4503 : }
4504 :
4505 : /*
4506 : * a helper function to delete the leaf pointed to by path->slots[1] and
4507 : * path->nodes[1].
4508 : *
4509 : * This deletes the pointer in path->nodes[1] and frees the leaf
4510 : * block extent. zero is returned if it all worked out, < 0 otherwise.
4511 : *
4512 : * The path must have already been setup for deleting the leaf, including
4513 : * all the proper balancing. path->nodes[1] must be locked.
4514 : */
4515 118208 : static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4516 : struct btrfs_root *root,
4517 : struct btrfs_path *path,
4518 : struct extent_buffer *leaf)
4519 : {
4520 118208 : int ret;
4521 :
4522 118208 : WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4523 118208 : ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4524 118211 : if (ret < 0)
4525 : return ret;
4526 :
4527 : /*
4528 : * btrfs_free_extent is expensive, we want to make sure we
4529 : * aren't holding any locks when we call it
4530 : */
4531 118211 : btrfs_unlock_up_safe(path, 0);
4532 :
4533 118211 : root_sub_used(root, leaf->len);
4534 :
4535 118211 : atomic_inc(&leaf->refs);
4536 118211 : btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4537 118211 : free_extent_buffer_stale(leaf);
4538 118211 : return 0;
4539 : }
4540 : /*
4541 : * delete the item at the leaf level in path. If that empties
4542 : * the leaf, remove it from the tree
4543 : */
4544 27434080 : int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4545 : struct btrfs_path *path, int slot, int nr)
4546 : {
4547 27434080 : struct btrfs_fs_info *fs_info = root->fs_info;
4548 27434080 : struct extent_buffer *leaf;
4549 27434080 : int ret = 0;
4550 27434080 : int wret;
4551 27434080 : u32 nritems;
4552 :
4553 27434080 : leaf = path->nodes[0];
4554 27434080 : nritems = btrfs_header_nritems(leaf);
4555 :
4556 27434080 : if (slot + nr != nritems) {
4557 22121287 : const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4558 22120344 : const int data_end = leaf_data_end(leaf);
4559 22119953 : struct btrfs_map_token token;
4560 22119953 : u32 dsize = 0;
4561 22119953 : int i;
4562 :
4563 49618652 : for (i = 0; i < nr; i++)
4564 27498976 : dsize += btrfs_item_size(leaf, slot + i);
4565 :
4566 22119676 : memmove_leaf_data(leaf, data_end + dsize, data_end,
4567 22119676 : last_off - data_end);
4568 :
4569 22119930 : btrfs_init_map_token(&token, leaf);
4570 2194062376 : for (i = slot + nr; i < nritems; i++) {
4571 2149821972 : u32 ioff;
4572 :
4573 2149821972 : ioff = btrfs_token_item_offset(&token, i);
4574 2149943119 : btrfs_set_token_item_offset(&token, i, ioff + dsize);
4575 : }
4576 :
4577 22120474 : memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4578 : }
4579 27433730 : btrfs_set_header_nritems(leaf, nritems - nr);
4580 27433730 : nritems -= nr;
4581 :
4582 : /* delete the leaf if we've emptied it */
4583 27433730 : if (nritems == 0) {
4584 12708 : if (leaf == root->node) {
4585 466 : btrfs_set_header_level(leaf, 0);
4586 : } else {
4587 12242 : btrfs_clear_buffer_dirty(trans, leaf);
4588 12242 : ret = btrfs_del_leaf(trans, root, path, leaf);
4589 12242 : if (ret < 0)
4590 : return ret;
4591 : }
4592 : } else {
4593 27421022 : int used = leaf_space_used(leaf, 0, nritems);
4594 27420345 : if (slot == 0) {
4595 2031666 : struct btrfs_disk_key disk_key;
4596 :
4597 2031666 : btrfs_item_key(leaf, &disk_key, 0);
4598 2031672 : fixup_low_keys(path, &disk_key, 1);
4599 : }
4600 :
4601 : /*
4602 : * Try to delete the leaf if it is mostly empty. We do this by
4603 : * trying to move all its items into its left and right neighbours.
4604 : * If we can't move all the items, then we don't delete it - it's
4605 : * not ideal, but future insertions might fill the leaf with more
4606 : * items, or items from other leaves might be moved later into our
4607 : * leaf due to deletions on those leaves.
4608 : */
4609 27420399 : if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4610 4194338 : u32 min_push_space;
4611 :
4612 : /* push_leaf_left fixes the path.
4613 : * make sure the path still points to our leaf
4614 : * for possible call to btrfs_del_ptr below
4615 : */
4616 4194338 : slot = path->slots[1];
4617 4194338 : atomic_inc(&leaf->refs);
4618 : /*
4619 : * We want to be able to at least push one item to the
4620 : * left neighbour leaf, and that's the first item.
4621 : */
4622 4194350 : min_push_space = sizeof(struct btrfs_item) +
4623 : btrfs_item_size(leaf, 0);
4624 4194326 : wret = push_leaf_left(trans, root, path, 0,
4625 : min_push_space, 1, (u32)-1);
4626 4194323 : if (wret < 0 && wret != -ENOSPC)
4627 0 : ret = wret;
4628 :
4629 4194323 : if (path->nodes[0] == leaf &&
4630 : btrfs_header_nritems(leaf)) {
4631 : /*
4632 : * If we were not able to push all items from our
4633 : * leaf to its left neighbour, then attempt to
4634 : * either push all the remaining items to the
4635 : * right neighbour or none. There's no advantage
4636 : * in pushing only some items, instead of all, as
4637 : * it's pointless to end up with a leaf having
4638 : * too few items while the neighbours can be full
4639 : * or nearly full.
4640 : */
4641 4069237 : nritems = btrfs_header_nritems(leaf);
4642 4069237 : min_push_space = leaf_space_used(leaf, 0, nritems);
4643 4069221 : wret = push_leaf_right(trans, root, path, 0,
4644 : min_push_space, 1, 0);
4645 4069234 : if (wret < 0 && wret != -ENOSPC)
4646 0 : ret = wret;
4647 : }
4648 :
4649 4194320 : if (btrfs_header_nritems(leaf) == 0) {
4650 105967 : path->slots[1] = slot;
4651 105967 : ret = btrfs_del_leaf(trans, root, path, leaf);
4652 105969 : if (ret < 0)
4653 : return ret;
4654 105969 : free_extent_buffer(leaf);
4655 105969 : ret = 0;
4656 : } else {
4657 : /* if we're still in the path, make sure
4658 : * we're dirty. Otherwise, one of the
4659 : * push_leaf functions must have already
4660 : * dirtied this buffer
4661 : */
4662 4088353 : if (path->nodes[0] == leaf)
4663 4045842 : btrfs_mark_buffer_dirty(leaf);
4664 4088356 : free_extent_buffer(leaf);
4665 : }
4666 : } else {
4667 23226061 : btrfs_mark_buffer_dirty(leaf);
4668 : }
4669 : }
4670 : return ret;
4671 : }
4672 :
4673 : /*
4674 : * A helper function to walk down the tree starting at min_key, and looking
4675 : * for nodes or leaves that are have a minimum transaction id.
4676 : * This is used by the btree defrag code, and tree logging
4677 : *
4678 : * This does not cow, but it does stuff the starting key it finds back
4679 : * into min_key, so you can call btrfs_search_slot with cow=1 on the
4680 : * key and get a writable path.
4681 : *
4682 : * This honors path->lowest_level to prevent descent past a given level
4683 : * of the tree.
4684 : *
4685 : * min_trans indicates the oldest transaction that you are interested
4686 : * in walking through. Any nodes or leaves older than min_trans are
4687 : * skipped over (without reading them).
4688 : *
4689 : * returns zero if something useful was found, < 0 on error and 1 if there
4690 : * was nothing in the tree that matched the search criteria.
4691 : */
4692 202359 : int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4693 : struct btrfs_path *path,
4694 : u64 min_trans)
4695 : {
4696 202359 : struct extent_buffer *cur;
4697 202359 : struct btrfs_key found_key;
4698 202359 : int slot;
4699 202359 : int sret;
4700 202359 : u32 nritems;
4701 202359 : int level;
4702 202359 : int ret = 1;
4703 202359 : int keep_locks = path->keep_locks;
4704 :
4705 202359 : ASSERT(!path->nowait);
4706 202359 : path->keep_locks = 1;
4707 213019 : again:
4708 213019 : cur = btrfs_read_lock_root_node(root);
4709 213023 : level = btrfs_header_level(cur);
4710 213023 : WARN_ON(path->nodes[level]);
4711 213023 : path->nodes[level] = cur;
4712 213023 : path->locks[level] = BTRFS_READ_LOCK;
4713 :
4714 213023 : if (btrfs_header_generation(cur) < min_trans) {
4715 148 : ret = 1;
4716 148 : goto out;
4717 : }
4718 638959 : while (1) {
4719 425917 : nritems = btrfs_header_nritems(cur);
4720 425917 : level = btrfs_header_level(cur);
4721 425917 : sret = btrfs_bin_search(cur, 0, min_key, &slot);
4722 425915 : if (sret < 0) {
4723 0 : ret = sret;
4724 0 : goto out;
4725 : }
4726 :
4727 : /* at the lowest level, we're done, setup the path and exit */
4728 425915 : if (level == path->lowest_level) {
4729 212821 : if (slot >= nritems)
4730 58465 : goto find_next_key;
4731 154356 : ret = 0;
4732 154356 : path->slots[level] = slot;
4733 154356 : btrfs_item_key_to_cpu(cur, &found_key, slot);
4734 154356 : goto out;
4735 : }
4736 213094 : if (sret && slot > 0)
4737 202302 : slot--;
4738 : /*
4739 : * check this node pointer against the min_trans parameters.
4740 : * If it is too old, skip to the next one.
4741 : */
4742 221514 : while (slot < nritems) {
4743 221464 : u64 gen;
4744 :
4745 221464 : gen = btrfs_node_ptr_generation(cur, slot);
4746 221463 : if (gen < min_trans) {
4747 8420 : slot++;
4748 8420 : continue;
4749 : }
4750 : break;
4751 : }
4752 50 : find_next_key:
4753 : /*
4754 : * we didn't find a candidate key in this node, walk forward
4755 : * and find another one
4756 : */
4757 271558 : if (slot >= nritems) {
4758 58515 : path->slots[level] = slot;
4759 58515 : sret = btrfs_find_next_key(root, path, min_key, level,
4760 : min_trans);
4761 58515 : if (sret == 0) {
4762 10660 : btrfs_release_path(path);
4763 10660 : goto again;
4764 : } else {
4765 47855 : goto out;
4766 : }
4767 : }
4768 : /* save our key for returning back */
4769 213043 : btrfs_node_key_to_cpu(cur, &found_key, slot);
4770 213044 : path->slots[level] = slot;
4771 213044 : if (level == path->lowest_level) {
4772 0 : ret = 0;
4773 0 : goto out;
4774 : }
4775 213044 : cur = btrfs_read_node_slot(cur, slot);
4776 213041 : if (IS_ERR(cur)) {
4777 0 : ret = PTR_ERR(cur);
4778 0 : goto out;
4779 : }
4780 :
4781 213041 : btrfs_tree_read_lock(cur);
4782 :
4783 213042 : path->locks[level - 1] = BTRFS_READ_LOCK;
4784 213042 : path->nodes[level - 1] = cur;
4785 213042 : unlock_up(path, level, 1, 0, NULL);
4786 : }
4787 0 : out:
4788 202359 : path->keep_locks = keep_locks;
4789 202359 : if (ret == 0) {
4790 154356 : btrfs_unlock_up_safe(path, path->lowest_level + 1);
4791 308718 : memcpy(min_key, &found_key, sizeof(found_key));
4792 : }
4793 202362 : return ret;
4794 : }
4795 :
4796 : /*
4797 : * this is similar to btrfs_next_leaf, but does not try to preserve
4798 : * and fixup the path. It looks for and returns the next key in the
4799 : * tree based on the current path and the min_trans parameters.
4800 : *
4801 : * 0 is returned if another key is found, < 0 if there are any errors
4802 : * and 1 is returned if there are no higher keys in the tree
4803 : *
4804 : * path->keep_locks should be set to 1 on the search made before
4805 : * calling this function.
4806 : */
4807 58526 : int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4808 : struct btrfs_key *key, int level, u64 min_trans)
4809 : {
4810 58526 : int slot;
4811 58526 : struct extent_buffer *c;
4812 :
4813 58526 : WARN_ON(!path->keep_locks && !path->skip_locking);
4814 115348 : while (level < BTRFS_MAX_LEVEL) {
4815 115348 : if (!path->nodes[level])
4816 : return 1;
4817 :
4818 115348 : slot = path->slots[level] + 1;
4819 115348 : c = path->nodes[level];
4820 : next:
4821 115573 : if (slot >= btrfs_header_nritems(c)) {
4822 104688 : int ret;
4823 104688 : int orig_lowest;
4824 104688 : struct btrfs_key cur_key;
4825 104688 : if (level + 1 >= BTRFS_MAX_LEVEL ||
4826 104688 : !path->nodes[level + 1])
4827 47866 : return 1;
4828 :
4829 56822 : if (path->locks[level + 1] || path->skip_locking) {
4830 56822 : level++;
4831 56822 : continue;
4832 : }
4833 :
4834 0 : slot = btrfs_header_nritems(c) - 1;
4835 0 : if (level == 0)
4836 0 : btrfs_item_key_to_cpu(c, &cur_key, slot);
4837 : else
4838 0 : btrfs_node_key_to_cpu(c, &cur_key, slot);
4839 :
4840 0 : orig_lowest = path->lowest_level;
4841 0 : btrfs_release_path(path);
4842 0 : path->lowest_level = level;
4843 0 : ret = btrfs_search_slot(NULL, root, &cur_key, path,
4844 : 0, 0);
4845 0 : path->lowest_level = orig_lowest;
4846 0 : if (ret < 0)
4847 0 : return ret;
4848 :
4849 0 : c = path->nodes[level];
4850 0 : slot = path->slots[level];
4851 0 : if (ret == 0)
4852 0 : slot++;
4853 0 : goto next;
4854 : }
4855 :
4856 10885 : if (level == 0)
4857 0 : btrfs_item_key_to_cpu(c, key, slot);
4858 : else {
4859 10885 : u64 gen = btrfs_node_ptr_generation(c, slot);
4860 :
4861 10885 : if (gen < min_trans) {
4862 225 : slot++;
4863 225 : goto next;
4864 : }
4865 10660 : btrfs_node_key_to_cpu(c, key, slot);
4866 : }
4867 : return 0;
4868 : }
4869 : return 1;
4870 : }
4871 :
4872 26821443 : int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4873 : u64 time_seq)
4874 : {
4875 26821443 : int slot;
4876 26821443 : int level;
4877 26821443 : struct extent_buffer *c;
4878 26821443 : struct extent_buffer *next;
4879 26821443 : struct btrfs_fs_info *fs_info = root->fs_info;
4880 26821443 : struct btrfs_key key;
4881 26821443 : bool need_commit_sem = false;
4882 26821443 : u32 nritems;
4883 26821443 : int ret;
4884 26821443 : int i;
4885 :
4886 : /*
4887 : * The nowait semantics are used only for write paths, where we don't
4888 : * use the tree mod log and sequence numbers.
4889 : */
4890 26821443 : if (time_seq)
4891 : ASSERT(!path->nowait);
4892 :
4893 26821443 : nritems = btrfs_header_nritems(path->nodes[0]);
4894 26821443 : if (nritems == 0)
4895 : return 1;
4896 :
4897 26817931 : btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4898 : again:
4899 26822256 : level = 1;
4900 26822256 : next = NULL;
4901 26822256 : btrfs_release_path(path);
4902 :
4903 26823516 : path->keep_locks = 1;
4904 :
4905 26823516 : if (time_seq) {
4906 259 : ret = btrfs_search_old_slot(root, &key, path, time_seq);
4907 : } else {
4908 26823257 : if (path->need_commit_sem) {
4909 69837 : path->need_commit_sem = 0;
4910 69837 : need_commit_sem = true;
4911 69837 : if (path->nowait) {
4912 0 : if (!down_read_trylock(&fs_info->commit_root_sem)) {
4913 0 : ret = -EAGAIN;
4914 0 : goto done;
4915 : }
4916 : } else {
4917 69837 : down_read(&fs_info->commit_root_sem);
4918 : }
4919 : }
4920 26823257 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4921 : }
4922 26823129 : path->keep_locks = 0;
4923 :
4924 26823129 : if (ret < 0)
4925 0 : goto done;
4926 :
4927 26823129 : nritems = btrfs_header_nritems(path->nodes[0]);
4928 : /*
4929 : * by releasing the path above we dropped all our locks. A balance
4930 : * could have added more items next to the key that used to be
4931 : * at the very end of the block. So, check again here and
4932 : * advance the path if there are now more items available.
4933 : */
4934 26823129 : if (nritems > 0 && path->slots[0] < nritems - 1) {
4935 11426 : if (ret == 0)
4936 11426 : path->slots[0]++;
4937 11426 : ret = 0;
4938 11426 : goto done;
4939 : }
4940 : /*
4941 : * So the above check misses one case:
4942 : * - after releasing the path above, someone has removed the item that
4943 : * used to be at the very end of the block, and balance between leafs
4944 : * gets another one with bigger key.offset to replace it.
4945 : *
4946 : * This one should be returned as well, or we can get leaf corruption
4947 : * later(esp. in __btrfs_drop_extents()).
4948 : *
4949 : * And a bit more explanation about this check,
4950 : * with ret > 0, the key isn't found, the path points to the slot
4951 : * where it should be inserted, so the path->slots[0] item must be the
4952 : * bigger one.
4953 : */
4954 26811703 : if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4955 0 : ret = 0;
4956 0 : goto done;
4957 : }
4958 :
4959 56383969 : while (level < BTRFS_MAX_LEVEL) {
4960 56383969 : if (!path->nodes[level]) {
4961 16407411 : ret = 1;
4962 16407411 : goto done;
4963 : }
4964 :
4965 39976558 : slot = path->slots[level] + 1;
4966 39976558 : c = path->nodes[level];
4967 39976558 : if (slot >= btrfs_header_nritems(c)) {
4968 29572266 : level++;
4969 29572266 : if (level == BTRFS_MAX_LEVEL) {
4970 0 : ret = 1;
4971 0 : goto done;
4972 : }
4973 29572266 : continue;
4974 : }
4975 :
4976 :
4977 : /*
4978 : * Our current level is where we're going to start from, and to
4979 : * make sure lockdep doesn't complain we need to drop our locks
4980 : * and nodes from 0 to our current level.
4981 : */
4982 20842957 : for (i = 0; i < level; i++) {
4983 10438351 : if (path->locks[level]) {
4984 10255005 : btrfs_tree_read_unlock(path->nodes[i]);
4985 10254762 : path->locks[i] = 0;
4986 : }
4987 10438108 : free_extent_buffer(path->nodes[i]);
4988 10438665 : path->nodes[i] = NULL;
4989 : }
4990 :
4991 10404606 : next = c;
4992 10404606 : ret = read_block_for_search(root, path, &next, level,
4993 : slot, &key);
4994 10403405 : if (ret == -EAGAIN && !path->nowait)
4995 4577 : goto again;
4996 :
4997 10398828 : if (ret < 0) {
4998 0 : btrfs_release_path(path);
4999 0 : goto done;
5000 : }
5001 :
5002 10398828 : if (!path->skip_locking) {
5003 10215935 : ret = btrfs_try_tree_read_lock(next);
5004 10216470 : if (!ret && path->nowait) {
5005 0 : ret = -EAGAIN;
5006 0 : goto done;
5007 : }
5008 10216470 : if (!ret && time_seq) {
5009 : /*
5010 : * If we don't get the lock, we may be racing
5011 : * with push_leaf_left, holding that lock while
5012 : * itself waiting for the leaf we've currently
5013 : * locked. To solve this situation, we give up
5014 : * on our lock and cycle.
5015 : */
5016 0 : free_extent_buffer(next);
5017 0 : btrfs_release_path(path);
5018 0 : cond_resched();
5019 0 : goto again;
5020 : }
5021 10216470 : if (!ret)
5022 175 : btrfs_tree_read_lock(next);
5023 : }
5024 : break;
5025 : }
5026 10399363 : path->slots[level] = slot;
5027 10433290 : while (1) {
5028 10433290 : level--;
5029 10433290 : path->nodes[level] = next;
5030 10433290 : path->slots[level] = 0;
5031 10433290 : if (!path->skip_locking)
5032 10250031 : path->locks[level] = BTRFS_READ_LOCK;
5033 10433290 : if (!level)
5034 : break;
5035 :
5036 34138 : ret = read_block_for_search(root, path, &next, level,
5037 : 0, &key);
5038 34046 : if (ret == -EAGAIN && !path->nowait)
5039 118 : goto again;
5040 :
5041 33928 : if (ret < 0) {
5042 0 : btrfs_release_path(path);
5043 0 : goto done;
5044 : }
5045 :
5046 33928 : if (!path->skip_locking) {
5047 33674 : if (path->nowait) {
5048 0 : if (!btrfs_try_tree_read_lock(next)) {
5049 0 : ret = -EAGAIN;
5050 0 : goto done;
5051 : }
5052 : } else {
5053 33674 : btrfs_tree_read_lock(next);
5054 : }
5055 : }
5056 : }
5057 : ret = 0;
5058 26817989 : done:
5059 26817989 : unlock_up(path, 0, 1, 0, NULL);
5060 26817993 : if (need_commit_sem) {
5061 69837 : int ret2;
5062 :
5063 69837 : path->need_commit_sem = 1;
5064 69837 : ret2 = finish_need_commit_sem_search(path);
5065 69837 : up_read(&fs_info->commit_root_sem);
5066 69837 : if (ret2)
5067 0 : ret = ret2;
5068 : }
5069 :
5070 : return ret;
5071 : }
5072 :
5073 16602857 : int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
5074 : {
5075 16602857 : path->slots[0]++;
5076 16602857 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
5077 122220 : return btrfs_next_old_leaf(root, path, time_seq);
5078 : return 0;
5079 : }
5080 :
5081 : /*
5082 : * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5083 : * searching until it gets past min_objectid or finds an item of 'type'
5084 : *
5085 : * returns 0 if something is found, 1 if nothing was found and < 0 on error
5086 : */
5087 210021 : int btrfs_previous_item(struct btrfs_root *root,
5088 : struct btrfs_path *path, u64 min_objectid,
5089 : int type)
5090 : {
5091 210401 : struct btrfs_key found_key;
5092 210401 : struct extent_buffer *leaf;
5093 210401 : u32 nritems;
5094 210401 : int ret;
5095 :
5096 210401 : while (1) {
5097 210401 : if (path->slots[0] == 0) {
5098 950 : ret = btrfs_prev_leaf(root, path);
5099 950 : if (ret != 0)
5100 948 : return ret;
5101 : } else {
5102 209451 : path->slots[0]--;
5103 : }
5104 209453 : leaf = path->nodes[0];
5105 209453 : nritems = btrfs_header_nritems(leaf);
5106 209453 : if (nritems == 0)
5107 : return 1;
5108 209453 : if (path->slots[0] == nritems)
5109 2 : path->slots[0]--;
5110 :
5111 209453 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5112 209453 : if (found_key.objectid < min_objectid)
5113 : break;
5114 208727 : if (found_key.type == type)
5115 : return 0;
5116 1378 : if (found_key.objectid == min_objectid &&
5117 : found_key.type < type)
5118 : break;
5119 : }
5120 : return 1;
5121 : }
5122 :
5123 : /*
5124 : * search in extent tree to find a previous Metadata/Data extent item with
5125 : * min objecitd.
5126 : *
5127 : * returns 0 if something is found, 1 if nothing was found and < 0 on error
5128 : */
5129 66522 : int btrfs_previous_extent_item(struct btrfs_root *root,
5130 : struct btrfs_path *path, u64 min_objectid)
5131 : {
5132 74733 : struct btrfs_key found_key;
5133 74733 : struct extent_buffer *leaf;
5134 74733 : u32 nritems;
5135 74733 : int ret;
5136 :
5137 74733 : while (1) {
5138 74733 : if (path->slots[0] == 0) {
5139 10 : ret = btrfs_prev_leaf(root, path);
5140 10 : if (ret != 0)
5141 10 : return ret;
5142 : } else {
5143 74723 : path->slots[0]--;
5144 : }
5145 74723 : leaf = path->nodes[0];
5146 74723 : nritems = btrfs_header_nritems(leaf);
5147 74723 : if (nritems == 0)
5148 : return 1;
5149 74723 : if (path->slots[0] == nritems)
5150 0 : path->slots[0]--;
5151 :
5152 74723 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5153 74723 : if (found_key.objectid < min_objectid)
5154 : break;
5155 74723 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5156 : found_key.type == BTRFS_METADATA_ITEM_KEY)
5157 : return 0;
5158 8211 : if (found_key.objectid == min_objectid &&
5159 : found_key.type < BTRFS_EXTENT_ITEM_KEY)
5160 : break;
5161 : }
5162 : return 1;
5163 : }
5164 :
5165 11 : int __init btrfs_ctree_init(void)
5166 : {
5167 11 : btrfs_path_cachep = kmem_cache_create("btrfs_path",
5168 : sizeof(struct btrfs_path), 0,
5169 : SLAB_MEM_SPREAD, NULL);
5170 11 : if (!btrfs_path_cachep)
5171 0 : return -ENOMEM;
5172 : return 0;
5173 : }
5174 :
5175 0 : void __cold btrfs_ctree_exit(void)
5176 : {
5177 0 : kmem_cache_destroy(btrfs_path_cachep);
5178 0 : }
|