Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * fs/ext4/extents_status.c
4 : *
5 : * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
6 : * Modified by
7 : * Allison Henderson <achender@linux.vnet.ibm.com>
8 : * Hugh Dickins <hughd@google.com>
9 : * Zheng Liu <wenqing.lz@taobao.com>
10 : *
11 : * Ext4 extents status tree core functions.
12 : */
13 : #include <linux/list_sort.h>
14 : #include <linux/proc_fs.h>
15 : #include <linux/seq_file.h>
16 : #include "ext4.h"
17 :
18 : #include <trace/events/ext4.h>
19 :
20 : /*
21 : * According to previous discussions at the Ext4 Developer Workshop,
22 : * we will introduce a new structure called the io tree to track all
23 : * extent status in order to solve some problems that we have met
24 : * (e.g. reserved space warnings), and to provide extent-level locking.
25 : * The delay extent tree was the first step toward this goal. It was
26 : * originally built by Yongqiang Yang. At that time it was called the
27 : * delay extent tree, whose only goal was to track delayed extents in
28 : * memory to simplify the implementation of fiemap and bigalloc, and to
29 : * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it was still
30 : * called the delay extent tree in the first commit. But to better
31 : * describe what it does, it has been renamed to the extent status tree.
32 : *
33 : * Step1:
34 : * The first step has now been done. All delayed extents are tracked
35 : * in the tree. The tree maintains a delayed extent from the moment a
36 : * delayed allocation is issued until the extent is written out or
37 : * invalidated. Therefore the implementations of fiemap and bigalloc
38 : * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
39 : *
40 : * The following comment describes the implementation of the extent
41 : * status tree and future work.
42 : *
43 : * Step2:
44 : * In this step all extent status is tracked by the extent status tree.
45 : * Thus, we can first try to look up a block mapping in this tree before
46 : * searching the extent tree. Hence, the single-extent cache can be
47 : * removed because the extent status tree does a better job. Extents in
48 : * the status tree are loaded on demand, so the tree may not contain
49 : * all of the extents in a file. Meanwhile we define a shrinker to
50 : * reclaim memory from the extent status tree, because a fragmented
51 : * extent tree would make the status tree cost too much memory.
52 : * Written/unwritten/hole extents in the tree will be reclaimed by this
53 : * shrinker when we are under high memory pressure. Delayed extents are
54 : * not reclaimed because fiemap, bigalloc, and seek_data/hole need them.
55 : */
56 :
57 : /*
58 : * Extent status tree implementation for ext4.
59 : *
60 : *
61 : * ==========================================================================
62 : * Extent status tree tracks all extent status.
63 : *
64 : * 1. Why do we need to implement an extent status tree?
65 : *
66 : * Without an extent status tree, ext4 identifies a delayed extent by
67 : * looking up the page cache, which leads to complicated, buggy, and
68 : * inefficient code.
69 : *
70 : * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know
71 : * whether a block or a range of blocks belongs to a delayed extent.
72 : *
73 : * Let us have a look at how they work without an extent status tree.
74 : * -- FIEMAP
75 : * FIEMAP looks up the page cache to tell delayed allocations from holes.
76 : *
77 : * -- SEEK_HOLE/DATA
78 : * SEEK_HOLE/DATA has the same problem as FIEMAP.
79 : *
80 : * -- bigalloc
81 : * bigalloc looks up the page cache to figure out whether a block
82 : * is already under delayed allocation, to determine whether a
83 : * quota reservation is needed for the cluster.
84 : *
85 : * -- writeout
86 : * Writeout looks up the whole page cache to see if a buffer is
87 : * mapped. If there are not very many delayed buffers, this is
88 : * time consuming.
89 : *
90 : * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
91 : * bigalloc and writeout can figure out whether a block or a range of
92 : * blocks is under delayed allocation (i.e. belongs to a delayed
93 : * extent) by searching the extent status tree.
94 : *
95 : *
96 : * ==========================================================================
97 : * 2. Ext4 extent status tree implementation
98 : *
99 : * -- extent
100 : * An extent is a range of blocks which are logically and physically
101 : * contiguous. Unlike an extent in the extent tree, this extent is an
102 : * in-memory struct; there is no corresponding on-disk data. There
103 : * is no limit on the length of an extent, so an extent can contain
104 : * as many blocks as are logically and physically contiguous.
105 : *
106 : * -- extent status tree
107 : * Every inode has an extent status tree and all allocated blocks
108 : * are added to the tree with their status. The extents in the
109 : * tree are ordered by logical block number.
110 : *
111 : * -- operations on an extent status tree
112 : * There are three important operations on an extent status tree: finding
113 : * the next extent, adding an extent (a range of blocks) and removing an extent.
114 : *
115 : * -- races on an extent status tree
116 : * The extent status tree is protected by inode->i_es_lock.
117 : *
118 : * -- memory consumption
119 : * A fragmented extent tree will make the extent status tree cost too
120 : * much memory. Hence, we will reclaim written/unwritten/hole extents
121 : * from the tree under heavy memory pressure.
122 : *
123 : *
124 : * ==========================================================================
125 : * 3. Performance analysis
126 : *
127 : * -- overhead
128 : * 1. There is a cached extent for write access, so if writes are
129 : * not very random, adding-space operations run in O(1) time.
130 : *
131 : * -- gain
132 : * 2. Code is much simpler, more readable, more maintainable, and
133 : * more efficient.
134 : *
135 : *
136 : * ==========================================================================
137 : * 4. TODO list
138 : *
139 : * -- Refactor delayed space reservation
140 : *
141 : * -- Extent-level locking
142 : */
143 :
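/*
 * A minimal usage sketch (illustrative only, with hypothetical block
 * numbers; locking, error handling and the surrounding mapping logic
 * are elided) of the three core operations described above, using the
 * helpers defined later in this file.
 *
 * Cache a written mapping for logical blocks [lblk, lblk + len):
 *
 *	ext4_es_insert_extent(inode, lblk, len, pblk,
 *			      EXTENT_STATUS_WRITTEN);
 *
 * Consult the tree before walking the on-disk extent tree:
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, lblk, NULL, &es))
 *		pblk = ext4_es_pblock(&es) + lblk - es.es_lblk;
 *
 * Drop the cached range, e.g. when the blocks are invalidated:
 *
 *	ext4_es_remove_extent(inode, lblk, len);
 */
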
144 : static struct kmem_cache *ext4_es_cachep;
145 : static struct kmem_cache *ext4_pending_cachep;
146 :
147 : static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
148 : struct extent_status *prealloc);
149 : static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
150 : ext4_lblk_t end, int *reserved,
151 : struct extent_status *prealloc);
152 : static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
153 : static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
154 : struct ext4_inode_info *locked_ei);
155 : static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
156 : ext4_lblk_t len);
157 :
158 2 : int __init ext4_init_es(void)
159 : {
160 2 : ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
161 2 : if (ext4_es_cachep == NULL)
162 0 : return -ENOMEM;
163 : return 0;
164 : }
165 :
166 0 : void ext4_exit_es(void)
167 : {
168 0 : kmem_cache_destroy(ext4_es_cachep);
169 0 : }
170 :
171 0 : void ext4_es_init_tree(struct ext4_es_tree *tree)
172 : {
173 0 : tree->root = RB_ROOT;
174 0 : tree->cache_es = NULL;
175 0 : }
176 :
177 : #ifdef ES_DEBUG__
178 : static void ext4_es_print_tree(struct inode *inode)
179 : {
180 : struct ext4_es_tree *tree;
181 : struct rb_node *node;
182 :
183 : printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
184 : tree = &EXT4_I(inode)->i_es_tree;
185 : node = rb_first(&tree->root);
186 : while (node) {
187 : struct extent_status *es;
188 : es = rb_entry(node, struct extent_status, rb_node);
189 : printk(KERN_DEBUG " [%u/%u) %llu %x",
190 : es->es_lblk, es->es_len,
191 : ext4_es_pblock(es), ext4_es_status(es));
192 : node = rb_next(node);
193 : }
194 : printk(KERN_DEBUG "\n");
195 : }
196 : #else
197 : #define ext4_es_print_tree(inode)
198 : #endif
199 :
200 0 : static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
201 : {
202 0 : BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
203 0 : return es->es_lblk + es->es_len - 1;
204 : }
205 :
206 : /*
207 : * Search through the tree for an extent with a given offset. If
208 : * it can't be found, try to find the next extent.
209 : */
210 0 : static struct extent_status *__es_tree_search(struct rb_root *root,
211 : ext4_lblk_t lblk)
212 : {
213 0 : struct rb_node *node = root->rb_node;
214 0 : struct extent_status *es = NULL;
215 :
216 0 : while (node) {
217 0 : es = rb_entry(node, struct extent_status, rb_node);
218 0 : if (lblk < es->es_lblk)
219 0 : node = node->rb_left;
220 0 : else if (lblk > ext4_es_end(es))
221 0 : node = node->rb_right;
222 : else
223 0 : return es;
224 : }
225 :
226 0 : if (es && lblk < es->es_lblk)
227 : return es;
228 :
229 0 : if (es && lblk > ext4_es_end(es)) {
230 0 : node = rb_next(&es->rb_node);
231 0 : return node ? rb_entry(node, struct extent_status, rb_node) :
232 : NULL;
233 : }
234 :
235 : return NULL;
236 : }
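
/*
 * A worked example of the search semantics above, with hypothetical
 * tree contents: if the tree holds [0/5) and [10/5), searching for
 * lblk 3 returns [0/5) (the extent containing the block), searching
 * for lblk 7 returns [10/5) (the next extent after the hole), and
 * searching for lblk 20 returns NULL (no extent at or after that
 * offset).
 */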
237 :
238 : /*
239 : * __es_find_extent_range - find extent with specified status within block
240 : * range or next extent following block range in
241 : * extents status tree
242 : *
243 : * @inode - file containing the range
244 : * @matching_fn - pointer to function that matches extents with desired status
245 : * @lblk - logical block defining start of range
246 : * @end - logical block defining end of range
247 : * @es - extent found, if any
248 : *
249 : * Find the first extent within the block range specified by @lblk and @end
250 : * in the extents status tree that satisfies @matching_fn. If a match
251 : * is found, it's returned in @es. If not, and a matching extent is found
252 : * beyond the block range, it's returned in @es. If no match is found, an
253 : * extent is returned in @es whose es_lblk, es_len, and es_pblk components
254 : * are 0.
255 : */
256 0 : static void __es_find_extent_range(struct inode *inode,
257 : int (*matching_fn)(struct extent_status *es),
258 : ext4_lblk_t lblk, ext4_lblk_t end,
259 : struct extent_status *es)
260 : {
261 0 : struct ext4_es_tree *tree = NULL;
262 0 : struct extent_status *es1 = NULL;
263 0 : struct rb_node *node;
264 :
265 0 : WARN_ON(es == NULL);
266 0 : WARN_ON(end < lblk);
267 :
268 0 : tree = &EXT4_I(inode)->i_es_tree;
269 :
270 : /* see if the extent has been cached */
271 0 : es->es_lblk = es->es_len = es->es_pblk = 0;
272 0 : es1 = READ_ONCE(tree->cache_es);
273 0 : if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
274 0 : es_debug("%u cached by [%u/%u) %llu %x\n",
275 : lblk, es1->es_lblk, es1->es_len,
276 : ext4_es_pblock(es1), ext4_es_status(es1));
277 0 : goto out;
278 : }
279 :
280 0 : es1 = __es_tree_search(&tree->root, lblk);
281 :
282 0 : out:
283 0 : if (es1 && !matching_fn(es1)) {
284 0 : while ((node = rb_next(&es1->rb_node)) != NULL) {
285 0 : es1 = rb_entry(node, struct extent_status, rb_node);
286 0 : if (es1->es_lblk > end) {
287 : es1 = NULL;
288 : break;
289 : }
290 0 : if (matching_fn(es1))
291 : break;
292 : }
293 : }
294 :
295 0 : if (es1 && matching_fn(es1)) {
296 0 : WRITE_ONCE(tree->cache_es, es1);
297 0 : es->es_lblk = es1->es_lblk;
298 0 : es->es_len = es1->es_len;
299 0 : es->es_pblk = es1->es_pblk;
300 : }
301 :
302 0 : }
303 :
304 : /*
305 : * Locking for __es_find_extent_range() for external use
306 : */
307 0 : void ext4_es_find_extent_range(struct inode *inode,
308 : int (*matching_fn)(struct extent_status *es),
309 : ext4_lblk_t lblk, ext4_lblk_t end,
310 : struct extent_status *es)
311 : {
312 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
313 : return;
314 :
315 0 : trace_ext4_es_find_extent_range_enter(inode, lblk);
316 :
317 0 : read_lock(&EXT4_I(inode)->i_es_lock);
318 0 : __es_find_extent_range(inode, matching_fn, lblk, end, es);
319 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
320 :
321 0 : trace_ext4_es_find_extent_range_exit(inode, es);
322 : }
323 :
324 : /*
325 : * __es_scan_range - search block range for block with specified status
326 : * in extents status tree
327 : *
328 : * @inode - file containing the range
329 : * @matching_fn - pointer to function that matches extents with desired status
330 : * @lblk - logical block defining start of range
331 : * @end - logical block defining end of range
332 : *
333 : * Returns true if at least one block in the specified block range satisfies
334 : * the criterion specified by @matching_fn, and false if not. If at least
335 : * one extent in the range has the specified status, then there is at
336 : * least one block in the range with that status. Should only be called
337 : * by code that has taken i_es_lock.
338 : */
339 0 : static bool __es_scan_range(struct inode *inode,
340 : int (*matching_fn)(struct extent_status *es),
341 : ext4_lblk_t start, ext4_lblk_t end)
342 : {
343 0 : struct extent_status es;
344 :
345 0 : __es_find_extent_range(inode, matching_fn, start, end, &es);
346 0 : if (es.es_len == 0)
347 : return false; /* no matching extent in the tree */
348 0 : else if (es.es_lblk <= start &&
349 0 : start < es.es_lblk + es.es_len)
350 : return true;
351 0 : else if (start <= es.es_lblk && es.es_lblk <= end)
352 : return true;
353 : else
354 0 : return false;
355 : }
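
/*
 * A worked example of the checks above, with hypothetical numbers:
 * given a single matching extent [10/5) (blocks 10..14) in the tree,
 * scanning [12, 20] returns true (the extent covers the start of the
 * range), scanning [0, 12] returns true (the extent starts inside the
 * range), and scanning [0, 9] returns false (the extent found lies
 * entirely beyond the range).
 */
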
356 : /*
357 : * Locking for __es_scan_range() for external use
358 : */
359 0 : bool ext4_es_scan_range(struct inode *inode,
360 : int (*matching_fn)(struct extent_status *es),
361 : ext4_lblk_t lblk, ext4_lblk_t end)
362 : {
363 0 : bool ret;
364 :
365 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
366 : return false;
367 :
368 0 : read_lock(&EXT4_I(inode)->i_es_lock);
369 0 : ret = __es_scan_range(inode, matching_fn, lblk, end);
370 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
371 :
372 0 : return ret;
373 : }
374 :
375 : /*
376 : * __es_scan_clu - search cluster for block with specified status in
377 : * extents status tree
378 : *
379 : * @inode - file containing the cluster
380 : * @matching_fn - pointer to function that matches extents with desired status
381 : * @lblk - logical block in cluster to be searched
382 : *
383 : * Returns true if at least one extent in the cluster containing @lblk
384 : * satisfies the criterion specified by @matching_fn, and false if not. If at
385 : * least one extent has the specified status, then there is at least one block
386 : * in the cluster with that status. Should only be called by code that has
387 : * taken i_es_lock.
388 : */
389 0 : static bool __es_scan_clu(struct inode *inode,
390 : int (*matching_fn)(struct extent_status *es),
391 : ext4_lblk_t lblk)
392 : {
393 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
394 0 : ext4_lblk_t lblk_start, lblk_end;
395 :
396 0 : lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
397 0 : lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
398 :
399 0 : return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
400 : }
401 :
402 : /*
403 : * Locking for __es_scan_clu() for external use
404 : */
405 0 : bool ext4_es_scan_clu(struct inode *inode,
406 : int (*matching_fn)(struct extent_status *es),
407 : ext4_lblk_t lblk)
408 : {
409 0 : bool ret;
410 :
411 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
412 : return false;
413 :
414 0 : read_lock(&EXT4_I(inode)->i_es_lock);
415 0 : ret = __es_scan_clu(inode, matching_fn, lblk);
416 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
417 :
418 0 : return ret;
419 : }
420 :
421 0 : static void ext4_es_list_add(struct inode *inode)
422 : {
423 0 : struct ext4_inode_info *ei = EXT4_I(inode);
424 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
425 :
426 0 : if (!list_empty(&ei->i_es_list))
427 : return;
428 :
429 0 : spin_lock(&sbi->s_es_lock);
430 0 : if (list_empty(&ei->i_es_list)) {
431 0 : list_add_tail(&ei->i_es_list, &sbi->s_es_list);
432 0 : sbi->s_es_nr_inode++;
433 : }
434 0 : spin_unlock(&sbi->s_es_lock);
435 : }
436 :
437 0 : static void ext4_es_list_del(struct inode *inode)
438 : {
439 0 : struct ext4_inode_info *ei = EXT4_I(inode);
440 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
441 :
442 0 : spin_lock(&sbi->s_es_lock);
443 0 : if (!list_empty(&ei->i_es_list)) {
444 0 : list_del_init(&ei->i_es_list);
445 0 : sbi->s_es_nr_inode--;
446 0 : WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
447 : }
448 0 : spin_unlock(&sbi->s_es_lock);
449 0 : }
450 :
451 : /*
452 : * Returns true if this extent_status entry must be kept: its allocation
453 : * must not fail, and it cannot be reclaimed until its status changes.
454 : */
455 : static inline bool ext4_es_must_keep(struct extent_status *es)
456 : {
457 : /* fiemap, bigalloc, and seek_data/hole need to use it. */
458 0 : if (ext4_es_is_delayed(es))
459 0 : return true;
460 :
461 : return false;
462 : }
463 :
464 : static inline struct extent_status *__es_alloc_extent(bool nofail)
465 : {
466 0 : if (!nofail)
467 0 : return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
468 :
469 0 : return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
470 : }
471 :
472 0 : static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
473 : ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
474 : {
475 0 : es->es_lblk = lblk;
476 0 : es->es_len = len;
477 0 : es->es_pblk = pblk;
478 :
479 : /* We never try to reclaim an extent that must be kept, so we don't count it. */
480 0 : if (!ext4_es_must_keep(es)) {
481 0 : if (!EXT4_I(inode)->i_es_shk_nr++)
482 0 : ext4_es_list_add(inode);
483 0 : percpu_counter_inc(&EXT4_SB(inode->i_sb)->
484 : s_es_stats.es_stats_shk_cnt);
485 : }
486 :
487 0 : EXT4_I(inode)->i_es_all_nr++;
488 0 : percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
489 0 : }
490 :
491 : static inline void __es_free_extent(struct extent_status *es)
492 : {
493 0 : kmem_cache_free(ext4_es_cachep, es);
494 0 : }
495 :
496 0 : static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
497 : {
498 0 : EXT4_I(inode)->i_es_all_nr--;
499 0 : percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
500 :
501 : /* Decrease the shrink counter when we can reclaim the extent. */
502 0 : if (!ext4_es_must_keep(es)) {
503 0 : BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
504 0 : if (!--EXT4_I(inode)->i_es_shk_nr)
505 0 : ext4_es_list_del(inode);
506 0 : percpu_counter_dec(&EXT4_SB(inode->i_sb)->
507 : s_es_stats.es_stats_shk_cnt);
508 : }
509 :
510 0 : __es_free_extent(es);
511 0 : }
512 :
513 : /*
514 : * Check whether or not two extents can be merged.
515 : * Conditions:
516 : * - logical block numbers are contiguous
517 : * - physical block numbers are contiguous
518 : * - status is equal
519 : */
520 0 : static int ext4_es_can_be_merged(struct extent_status *es1,
521 : struct extent_status *es2)
522 : {
523 0 : if (ext4_es_type(es1) != ext4_es_type(es2))
524 : return 0;
525 :
526 0 : if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
527 0 : pr_warn("ES assertion failed when merging extents. "
528 : "The sum of lengths of es1 (%d) and es2 (%d) "
529 : "is bigger than allowed file size (%d)\n",
530 : es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
531 0 : WARN_ON(1);
532 0 : return 0;
533 : }
534 :
535 0 : if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
536 : return 0;
537 :
538 0 : if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
539 0 : (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
540 : return 1;
541 :
542 0 : if (ext4_es_is_hole(es1))
543 : return 1;
544 :
545 : /* we need to check that the delayed extent doesn't have the unwritten status */
546 0 : if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
547 0 : return 1;
548 :
549 : return 0;
550 : }
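
/*
 * A worked example of the merge conditions above, with hypothetical
 * numbers: a written extent [0/8) at pblk 100 and a written extent
 * [8/4) at pblk 108 are mergeable, since 0 + 8 == 8, 100 + 8 == 108,
 * and the status matches; the result is [0/12) at pblk 100. If the
 * second extent instead started at pblk 200, the physical contiguity
 * check would fail and the extents would stay separate.
 */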
551 :
552 : static struct extent_status *
553 0 : ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
554 : {
555 0 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
556 0 : struct extent_status *es1;
557 0 : struct rb_node *node;
558 :
559 0 : node = rb_prev(&es->rb_node);
560 0 : if (!node)
561 : return es;
562 :
563 0 : es1 = rb_entry(node, struct extent_status, rb_node);
564 0 : if (ext4_es_can_be_merged(es1, es)) {
565 0 : es1->es_len += es->es_len;
566 0 : if (ext4_es_is_referenced(es))
567 0 : ext4_es_set_referenced(es1);
568 0 : rb_erase(&es->rb_node, &tree->root);
569 0 : ext4_es_free_extent(inode, es);
570 0 : es = es1;
571 : }
572 :
573 : return es;
574 : }
575 :
576 : static struct extent_status *
577 0 : ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
578 : {
579 0 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
580 0 : struct extent_status *es1;
581 0 : struct rb_node *node;
582 :
583 0 : node = rb_next(&es->rb_node);
584 0 : if (!node)
585 : return es;
586 :
587 0 : es1 = rb_entry(node, struct extent_status, rb_node);
588 0 : if (ext4_es_can_be_merged(es, es1)) {
589 0 : es->es_len += es1->es_len;
590 0 : if (ext4_es_is_referenced(es1))
591 0 : ext4_es_set_referenced(es);
592 0 : rb_erase(node, &tree->root);
593 0 : ext4_es_free_extent(inode, es1);
594 : }
595 :
596 : return es;
597 : }
598 :
599 : #ifdef ES_AGGRESSIVE_TEST
600 : #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
601 :
602 : static void ext4_es_insert_extent_ext_check(struct inode *inode,
603 : struct extent_status *es)
604 : {
605 : struct ext4_ext_path *path = NULL;
606 : struct ext4_extent *ex;
607 : ext4_lblk_t ee_block;
608 : ext4_fsblk_t ee_start;
609 : unsigned short ee_len;
610 : int depth, ee_status, es_status;
611 :
612 : path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
613 : if (IS_ERR(path))
614 : return;
615 :
616 : depth = ext_depth(inode);
617 : ex = path[depth].p_ext;
618 :
619 : if (ex) {
620 :
621 : ee_block = le32_to_cpu(ex->ee_block);
622 : ee_start = ext4_ext_pblock(ex);
623 : ee_len = ext4_ext_get_actual_len(ex);
624 :
625 : ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
626 : es_status = ext4_es_is_unwritten(es) ? 1 : 0;
627 :
628 : /*
629 : * Make sure ex and es do not overlap when we try to insert
630 : * a delayed/hole extent.
631 : */
632 : if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
633 : if (in_range(es->es_lblk, ee_block, ee_len)) {
634 : pr_warn("ES insert assertion failed for "
635 : "inode: %lu we can find an extent "
636 : "at block [%d/%d/%llu/%c], but we "
637 : "want to add a delayed/hole extent "
638 : "[%d/%d/%llu/%x]\n",
639 : inode->i_ino, ee_block, ee_len,
640 : ee_start, ee_status ? 'u' : 'w',
641 : es->es_lblk, es->es_len,
642 : ext4_es_pblock(es), ext4_es_status(es));
643 : }
644 : goto out;
645 : }
646 :
647 : /*
648 : * We don't check ee_block == es->es_lblk, etc. because es
649 : * might be part of a whole extent, and vice versa.
650 : */
651 : if (es->es_lblk < ee_block ||
652 : ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
653 : pr_warn("ES insert assertion failed for inode: %lu "
654 : "ex_status [%d/%d/%llu/%c] != "
655 : "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
656 : ee_block, ee_len, ee_start,
657 : ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
658 : ext4_es_pblock(es), es_status ? 'u' : 'w');
659 : goto out;
660 : }
661 :
662 : if (ee_status ^ es_status) {
663 : pr_warn("ES insert assertion failed for inode: %lu "
664 : "ex_status [%d/%d/%llu/%c] != "
665 : "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
666 : ee_block, ee_len, ee_start,
667 : ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
668 : ext4_es_pblock(es), es_status ? 'u' : 'w');
669 : }
670 : } else {
671 : /*
672 : * We can't find an extent on disk. So we need to make sure
673 : * that we are not trying to add a written/unwritten extent.
674 : */
675 : if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
676 : pr_warn("ES insert assertion failed for inode: %lu "
677 : "can't find an extent at block %d but we want "
678 : "to add a written/unwritten extent "
679 : "[%d/%d/%llu/%x]\n", inode->i_ino,
680 : es->es_lblk, es->es_lblk, es->es_len,
681 : ext4_es_pblock(es), ext4_es_status(es));
682 : }
683 : }
684 : out:
685 : ext4_free_ext_path(path);
686 : }
687 :
688 : static void ext4_es_insert_extent_ind_check(struct inode *inode,
689 : struct extent_status *es)
690 : {
691 : struct ext4_map_blocks map;
692 : int retval;
693 :
694 : /*
695 : * Here we call ext4_ind_map_blocks to look up a block mapping because
696 : * the 'Indirect' structure is defined in indirect.c, so we can't
697 : * access the direct/indirect tree from outside it. It would be too
698 : * ugly to define this function in indirect.c.
699 : */
700 :
701 : map.m_lblk = es->es_lblk;
702 : map.m_len = es->es_len;
703 :
704 : retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
705 : if (retval > 0) {
706 : if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
707 : /*
708 : * We want to add a delayed/hole extent but this
709 : * block has been allocated.
710 : */
711 : pr_warn("ES insert assertion failed for inode: %lu "
712 : "We can find blocks but we want to add a "
713 : "delayed/hole extent [%d/%d/%llu/%x]\n",
714 : inode->i_ino, es->es_lblk, es->es_len,
715 : ext4_es_pblock(es), ext4_es_status(es));
716 : return;
717 : } else if (ext4_es_is_written(es)) {
718 : if (retval != es->es_len) {
719 : pr_warn("ES insert assertion failed for "
720 : "inode: %lu retval %d != es_len %d\n",
721 : inode->i_ino, retval, es->es_len);
722 : return;
723 : }
724 : if (map.m_pblk != ext4_es_pblock(es)) {
725 : pr_warn("ES insert assertion failed for "
726 : "inode: %lu m_pblk %llu != "
727 : "es_pblk %llu\n",
728 : inode->i_ino, map.m_pblk,
729 : ext4_es_pblock(es));
730 : return;
731 : }
732 : } else {
733 : /*
734 : * We don't need to check the unwritten case because
735 : * indirect-based files don't have unwritten extents.
736 : */
737 : BUG();
738 : }
739 : } else if (retval == 0) {
740 : if (ext4_es_is_written(es)) {
741 : pr_warn("ES insert assertion failed for inode: %lu "
742 : "We can't find the block but we want to add "
743 : "a written extent [%d/%d/%llu/%x]\n",
744 : inode->i_ino, es->es_lblk, es->es_len,
745 : ext4_es_pblock(es), ext4_es_status(es));
746 : return;
747 : }
748 : }
749 : }
750 :
751 : static inline void ext4_es_insert_extent_check(struct inode *inode,
752 : struct extent_status *es)
753 : {
754 : /*
755 : * We don't need to worry about the race condition because
756 : * caller takes i_data_sem locking.
757 : */
758 : BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
759 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
760 : ext4_es_insert_extent_ext_check(inode, es);
761 : else
762 : ext4_es_insert_extent_ind_check(inode, es);
763 : }
764 : #else
765 : static inline void ext4_es_insert_extent_check(struct inode *inode,
766 : struct extent_status *es)
767 : {
768 : }
769 : #endif
770 :
771 0 : static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
772 : struct extent_status *prealloc)
773 : {
774 0 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
775 0 : struct rb_node **p = &tree->root.rb_node;
776 0 : struct rb_node *parent = NULL;
777 0 : struct extent_status *es;
778 :
779 0 : while (*p) {
780 0 : parent = *p;
781 0 : es = rb_entry(parent, struct extent_status, rb_node);
782 :
783 0 : if (newes->es_lblk < es->es_lblk) {
784 0 : if (ext4_es_can_be_merged(newes, es)) {
785 : /*
786 : * Here we can modify es_lblk directly
787 : * because it isn't overlapped.
788 : */
789 0 : es->es_lblk = newes->es_lblk;
790 0 : es->es_len += newes->es_len;
791 0 : if (ext4_es_is_written(es) ||
792 : ext4_es_is_unwritten(es))
793 0 : ext4_es_store_pblock(es,
794 : newes->es_pblk);
795 0 : es = ext4_es_try_to_merge_left(inode, es);
796 0 : goto out;
797 : }
798 0 : p = &(*p)->rb_left;
799 0 : } else if (newes->es_lblk > ext4_es_end(es)) {
800 0 : if (ext4_es_can_be_merged(es, newes)) {
801 0 : es->es_len += newes->es_len;
802 0 : es = ext4_es_try_to_merge_right(inode, es);
803 0 : goto out;
804 : }
805 0 : p = &(*p)->rb_right;
806 : } else {
807 0 : BUG();
808 : return -EINVAL;
809 : }
810 : }
811 :
812 0 : if (prealloc)
813 : es = prealloc;
814 : else
815 0 : es = __es_alloc_extent(false);
816 0 : if (!es)
817 : return -ENOMEM;
818 0 : ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
819 : newes->es_pblk);
820 :
821 0 : rb_link_node(&es->rb_node, parent, p);
822 0 : rb_insert_color(&es->rb_node, &tree->root);
823 :
824 0 : out:
825 0 : tree->cache_es = es;
826 0 : return 0;
827 : }
828 :
829 : /*
830 : * ext4_es_insert_extent() adds information to an inode's extent
831 : * status tree.
832 : */
833 0 : void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
834 : ext4_lblk_t len, ext4_fsblk_t pblk,
835 : unsigned int status)
836 : {
837 0 : struct extent_status newes;
838 0 : ext4_lblk_t end = lblk + len - 1;
839 0 : int err1 = 0;
840 0 : int err2 = 0;
841 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
842 0 : struct extent_status *es1 = NULL;
843 0 : struct extent_status *es2 = NULL;
844 :
845 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
846 : return;
847 :
848 0 : es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
849 : lblk, len, pblk, status, inode->i_ino);
850 :
851 0 : if (!len)
852 : return;
853 :
854 0 : BUG_ON(end < lblk);
855 :
856 0 : if ((status & EXTENT_STATUS_DELAYED) &&
857 : (status & EXTENT_STATUS_WRITTEN)) {
858 0 : ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
859 : " delayed and written which can potentially "
860 : " cause data loss.", lblk, len);
861 0 : WARN_ON(1);
862 : }
863 :
864 0 : newes.es_lblk = lblk;
865 0 : newes.es_len = len;
866 0 : ext4_es_store_pblock_status(&newes, pblk, status);
867 0 : trace_ext4_es_insert_extent(inode, &newes);
868 :
869 0 : ext4_es_insert_extent_check(inode, &newes);
870 :
871 0 : retry:
872 0 : if (err1 && !es1)
873 0 : es1 = __es_alloc_extent(true);
874 0 : if ((err1 || err2) && !es2)
875 0 : es2 = __es_alloc_extent(true);
876 0 : write_lock(&EXT4_I(inode)->i_es_lock);
877 :
878 0 : err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
879 0 : if (err1 != 0)
880 0 : goto error;
881 :
882 0 : err2 = __es_insert_extent(inode, &newes, es2);
883 0 : if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
884 : err2 = 0;
885 0 : if (err2 != 0)
886 0 : goto error;
887 :
888 0 : if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
889 0 : (status & EXTENT_STATUS_WRITTEN ||
890 : status & EXTENT_STATUS_UNWRITTEN))
891 0 : __revise_pending(inode, lblk, len);
892 :
893 : /* es is pre-allocated but not used, free it. */
894 0 : if (es1 && !es1->es_len)
895 0 : __es_free_extent(es1);
896 0 : if (es2 && !es2->es_len)
897 0 : __es_free_extent(es2);
898 0 : error:
899 0 : write_unlock(&EXT4_I(inode)->i_es_lock);
900 0 : if (err1 || err2)
901 0 : goto retry;
902 :
903 : ext4_es_print_tree(inode);
904 : return;
905 : }
906 :
907 : /*
908 : * ext4_es_cache_extent() inserts information into the extent status
909 : * tree if and only if there isn't information about the range in
910 : * question already.
911 : */
912 0 : void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
913 : ext4_lblk_t len, ext4_fsblk_t pblk,
914 : unsigned int status)
915 : {
916 0 : struct extent_status *es;
917 0 : struct extent_status newes;
918 0 : ext4_lblk_t end = lblk + len - 1;
919 :
920 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
921 0 : return;
922 :
923 0 : newes.es_lblk = lblk;
924 0 : newes.es_len = len;
925 0 : ext4_es_store_pblock_status(&newes, pblk, status);
926 0 : trace_ext4_es_cache_extent(inode, &newes);
927 :
928 0 : if (!len)
929 : return;
930 :
931 0 : BUG_ON(end < lblk);
932 :
933 0 : write_lock(&EXT4_I(inode)->i_es_lock);
934 :
935 0 : es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
936 0 : if (!es || es->es_lblk > end)
937 0 : __es_insert_extent(inode, &newes, NULL);
938 0 : write_unlock(&EXT4_I(inode)->i_es_lock);
939 : }
940 :
941 : /*
942 : * ext4_es_lookup_extent() looks up an extent in the extent status tree.
943 : *
944 : * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
945 : *
946 : * Return: 1 if found, 0 if not
947 : */
948 0 : int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
949 : ext4_lblk_t *next_lblk,
950 : struct extent_status *es)
951 : {
952 0 : struct ext4_es_tree *tree;
953 0 : struct ext4_es_stats *stats;
954 0 : struct extent_status *es1 = NULL;
955 0 : struct rb_node *node;
956 0 : int found = 0;
957 :
958 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
959 : return 0;
960 :
961 0 : trace_ext4_es_lookup_extent_enter(inode, lblk);
962 0 : es_debug("lookup extent in block %u\n", lblk);
963 :
964 0 : tree = &EXT4_I(inode)->i_es_tree;
965 0 : read_lock(&EXT4_I(inode)->i_es_lock);
966 :
967 : /* first, see if the extent has been cached */
968 0 : es->es_lblk = es->es_len = es->es_pblk = 0;
969 0 : es1 = READ_ONCE(tree->cache_es);
970 0 : if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
971 0 : es_debug("%u cached by [%u/%u)\n",
972 : lblk, es1->es_lblk, es1->es_len);
973 0 : found = 1;
974 0 : goto out;
975 : }
976 :
977 0 : node = tree->root.rb_node;
978 0 : while (node) {
979 0 : es1 = rb_entry(node, struct extent_status, rb_node);
980 0 : if (lblk < es1->es_lblk)
981 0 : node = node->rb_left;
982 0 : else if (lblk > ext4_es_end(es1))
983 0 : node = node->rb_right;
984 : else {
985 : found = 1;
986 : break;
987 : }
988 : }
989 :
990 0 : out:
991 0 : stats = &EXT4_SB(inode->i_sb)->s_es_stats;
992 0 : if (found) {
993 0 : BUG_ON(!es1);
994 0 : es->es_lblk = es1->es_lblk;
995 0 : es->es_len = es1->es_len;
996 0 : es->es_pblk = es1->es_pblk;
997 0 : if (!ext4_es_is_referenced(es1))
998 0 : ext4_es_set_referenced(es1);
999 0 : percpu_counter_inc(&stats->es_stats_cache_hits);
1000 0 : if (next_lblk) {
1001 0 : node = rb_next(&es1->rb_node);
1002 0 : if (node) {
1003 0 : es1 = rb_entry(node, struct extent_status,
1004 : rb_node);
1005 0 : *next_lblk = es1->es_lblk;
1006 : } else
1007 0 : *next_lblk = 0;
1008 : }
1009 : } else {
1010 0 : percpu_counter_inc(&stats->es_stats_cache_misses);
1011 : }
1012 :
1013 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
1014 :
1015 0 : trace_ext4_es_lookup_extent_exit(inode, es, found);
1016 0 : return found;
1017 : }
1018 :
1019 : struct rsvd_count {
1020 : int ndelonly;
1021 : bool first_do_lblk_found;
1022 : ext4_lblk_t first_do_lblk;
1023 : ext4_lblk_t last_do_lblk;
1024 : struct extent_status *left_es;
1025 : bool partial;
1026 : ext4_lblk_t lclu;
1027 : };
1028 :
1029 : /*
1030 : * init_rsvd - initialize reserved count data before removing block range
1031 : * in file from extent status tree
1032 : *
1033 : * @inode - file containing range
1034 : * @lblk - first block in range
1035 : * @es - pointer to first extent in range
1036 : * @rc - pointer to reserved count data
1037 : *
1038 : * Assumes es is not NULL
1039 : */
1040 0 : static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1041 : struct extent_status *es, struct rsvd_count *rc)
1042 : {
1043 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1044 0 : struct rb_node *node;
1045 :
1046 0 : rc->ndelonly = 0;
1047 :
1048 : /*
1049 : * for bigalloc, note that the first delonly block in the range has not
1050 : * yet been found, record the extent containing the block to the left of
1051 : * the region to be removed, if any, and note that there's no partial
1052 : * cluster to track
1053 : */
1054 0 : if (sbi->s_cluster_ratio > 1) {
1055 0 : rc->first_do_lblk_found = false;
1056 0 : if (lblk > es->es_lblk) {
1057 0 : rc->left_es = es;
1058 : } else {
1059 0 : node = rb_prev(&es->rb_node);
1060 0 : rc->left_es = node ? rb_entry(node,
1061 : struct extent_status,
1062 0 : rb_node) : NULL;
1063 : }
1064 0 : rc->partial = false;
1065 : }
1066 0 : }
1067 :
1068 : /*
1069 : * count_rsvd - count the clusters containing delayed and not unwritten
1070 : * (delonly) blocks in a range within an extent and add to
1071 : * the running tally in rsvd_count
1072 : *
1073 : * @inode - file containing extent
1074 : * @lblk - first block in range
1075 : * @len - length of range in blocks
1076 : * @es - pointer to extent containing clusters to be counted
1077 : * @rc - pointer to reserved count data
1078 : *
1079 : * Tracks partial clusters found at the beginning and end of extents so
1080 : * they aren't overcounted when they span adjacent extents
1081 : */
1082 0 : static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1083 : struct extent_status *es, struct rsvd_count *rc)
1084 : {
1085 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1086 0 : ext4_lblk_t i, end, nclu;
1087 :
1088 0 : if (!ext4_es_is_delonly(es))
1089 : return;
1090 :
1091 0 : WARN_ON(len <= 0);
1092 :
1093 0 : if (sbi->s_cluster_ratio == 1) {
1094 0 : rc->ndelonly += (int) len;
1095 0 : return;
1096 : }
1097 :
1098 : /* bigalloc */
1099 :
1100 0 : i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1101 0 : end = lblk + (ext4_lblk_t) len - 1;
1102 0 : end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1103 :
1104 : /* record the first block of the first delonly extent seen */
1105 0 : if (!rc->first_do_lblk_found) {
1106 0 : rc->first_do_lblk = i;
1107 0 : rc->first_do_lblk_found = true;
1108 : }
1109 :
1110 : /* update the last lblk in the region seen so far */
1111 0 : rc->last_do_lblk = end;
1112 :
1113 : /*
1114 : * if we're tracking a partial cluster and the current extent
1115 : * doesn't start with it, count it and stop tracking
1116 : */
1117 0 : if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1118 0 : rc->ndelonly++;
1119 0 : rc->partial = false;
1120 : }
1121 :
1122 : /*
1123 : * if the first cluster doesn't start on a cluster boundary but
1124 : * ends on one, count it
1125 : */
1126 0 : if (EXT4_LBLK_COFF(sbi, i) != 0) {
1127 0 : if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1128 0 : rc->ndelonly++;
1129 0 : rc->partial = false;
1130 0 : i = EXT4_LBLK_CFILL(sbi, i) + 1;
1131 : }
1132 : }
1133 :
1134 : /*
1135 : * if the current cluster starts on a cluster boundary, count the
1136 : * number of whole delonly clusters in the extent
1137 : */
1138 0 : if ((i + sbi->s_cluster_ratio - 1) <= end) {
1139 0 : nclu = (end - i + 1) >> sbi->s_cluster_bits;
1140 0 : rc->ndelonly += nclu;
1141 0 : i += nclu << sbi->s_cluster_bits;
1142 : }
1143 :
1144 : /*
1145 : * start tracking a partial cluster if there's a partial at the end
1146 : * of the current extent and we're not already tracking one
1147 : */
1148 0 : if (!rc->partial && i <= end) {
1149 0 : rc->partial = true;
1150 0 : rc->lclu = EXT4_B2C(sbi, i);
1151 : }
1152 : }
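
/*
 * A worked example of the bigalloc accounting above, with a
 * hypothetical geometry of 16 blocks per cluster: counting a delonly
 * extent covering blocks 10..40 proceeds as follows. Blocks 10..15
 * are a partial head that ends on a cluster boundary, so cluster 0 is
 * counted; blocks 16..31 are one whole cluster, so cluster 1 is
 * counted; blocks 32..40 leave a trailing partial, so cluster 2 is
 * recorded in rc->lclu and counted later only if no adjacent extent
 * continues it.
 */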
1153 :
1154 : /*
1155 : * __pr_tree_search - search for a pending cluster reservation
1156 : *
1157 : * @root - root of pending reservation tree
1158 : * @lclu - logical cluster to search for
1159 : *
1160 : * Returns the pending reservation for the cluster identified by @lclu
1161 : * if found. If not, returns a reservation for the next cluster if any,
1162 : * and if not, returns NULL.
1163 : */
1164 0 : static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1165 : ext4_lblk_t lclu)
1166 : {
1167 0 : struct rb_node *node = root->rb_node;
1168 0 : struct pending_reservation *pr = NULL;
1169 :
1170 0 : while (node) {
1171 0 : pr = rb_entry(node, struct pending_reservation, rb_node);
1172 0 : if (lclu < pr->lclu)
1173 0 : node = node->rb_left;
1174 0 : else if (lclu > pr->lclu)
1175 0 : node = node->rb_right;
1176 : else
1177 0 : return pr;
1178 : }
1179 0 : if (pr && lclu < pr->lclu)
1180 : return pr;
1181 0 : if (pr && lclu > pr->lclu) {
1182 0 : node = rb_next(&pr->rb_node);
1183 0 : return node ? rb_entry(node, struct pending_reservation,
1184 0 : rb_node) : NULL;
1185 : }
1186 : return NULL;
1187 : }
1188 :
1189 : /*
1190 : * get_rsvd - calculates and returns the number of cluster reservations to be
1191 : * released when removing a block range from the extent status tree
1192 : * and releases any pending reservations within the range
1193 : *
1194 : * @inode - file containing block range
1195 : * @end - last block in range
1196 : * @right_es - pointer to extent containing next block beyond end or NULL
1197 : * @rc - pointer to reserved count data
1198 : *
1199 : * The number of reservations to be released is equal to the number of
1200 : * clusters containing delayed and not unwritten (delonly) blocks within
1201 : * the range, minus the number of clusters still containing delonly blocks
1202 : * at the ends of the range, and minus the number of pending reservations
1203 : * within the range.
1204 : */
1205 0 : static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1206 : struct extent_status *right_es,
1207 : struct rsvd_count *rc)
1208 : {
1209 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1210 0 : struct pending_reservation *pr;
1211 0 : struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1212 0 : struct rb_node *node;
1213 0 : ext4_lblk_t first_lclu, last_lclu;
1214 0 : bool left_delonly, right_delonly, count_pending;
1215 0 : struct extent_status *es;
1216 :
1217 0 : if (sbi->s_cluster_ratio > 1) {
1218 : /* count any remaining partial cluster */
1219 0 : if (rc->partial)
1220 0 : rc->ndelonly++;
1221 :
1222 0 : if (rc->ndelonly == 0)
1223 : return 0;
1224 :
1225 0 : first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1226 0 : last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1227 :
1228 : /*
1229 : * decrease the delonly count by the number of clusters at the
1230 : * ends of the range that still contain delonly blocks -
1231 : * these clusters still need to be reserved
1232 : */
1233 0 : left_delonly = right_delonly = false;
1234 :
1235 0 : es = rc->left_es;
1236 0 : while (es && ext4_es_end(es) >=
1237 0 : EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1238 0 : if (ext4_es_is_delonly(es)) {
1239 0 : rc->ndelonly--;
1240 0 : left_delonly = true;
1241 0 : break;
1242 : }
1243 0 : node = rb_prev(&es->rb_node);
1244 0 : if (!node)
1245 : break;
1246 : es = rb_entry(node, struct extent_status, rb_node);
1247 : }
1248 0 : if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1249 0 : if (end < ext4_es_end(right_es)) {
1250 : es = right_es;
1251 : } else {
1252 0 : node = rb_next(&right_es->rb_node);
1253 0 : es = node ? rb_entry(node, struct extent_status,
1254 0 : rb_node) : NULL;
1255 : }
1256 0 : while (es && es->es_lblk <=
1257 0 : EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1258 0 : if (ext4_es_is_delonly(es)) {
1259 0 : rc->ndelonly--;
1260 0 : right_delonly = true;
1261 0 : break;
1262 : }
1263 0 : node = rb_next(&es->rb_node);
1264 0 : if (!node)
1265 : break;
1266 : es = rb_entry(node, struct extent_status,
1267 : rb_node);
1268 : }
1269 : }
1270 :
1271 : /*
1272 : * Determine the block range that should be searched for
1273 : * pending reservations, if any. Clusters on the ends of the
1274 : * original removed range containing delonly blocks are
1275 : * excluded. They've already been accounted for and it's not
1276 : * possible to determine if an associated pending reservation
1277 : * should be released with the information available in the
1278 : * extents status tree.
1279 : */
1280 0 : if (first_lclu == last_lclu) {
1281 0 : if (left_delonly | right_delonly)
1282 : count_pending = false;
1283 : else
1284 0 : count_pending = true;
1285 : } else {
1286 0 : if (left_delonly)
1287 0 : first_lclu++;
1288 0 : if (right_delonly)
1289 0 : last_lclu--;
1290 0 : if (first_lclu <= last_lclu)
1291 : count_pending = true;
1292 : else
1293 0 : count_pending = false;
1294 : }
1295 :
1296 : /*
1297 : * a pending reservation found between first_lclu and last_lclu
1298 : * represents an allocated cluster that contained at least one
1299 : * delonly block, so the delonly total must be reduced by one
1300 : * for each pending reservation found and released
1301 : */
1302 0 : if (count_pending) {
1303 0 : pr = __pr_tree_search(&tree->root, first_lclu);
1304 0 : while (pr && pr->lclu <= last_lclu) {
1305 0 : rc->ndelonly--;
1306 0 : node = rb_next(&pr->rb_node);
1307 0 : rb_erase(&pr->rb_node, &tree->root);
1308 0 : kmem_cache_free(ext4_pending_cachep, pr);
1309 0 : if (!node)
1310 : break;
1311 : pr = rb_entry(node, struct pending_reservation,
1312 : rb_node);
1313 : }
1314 : }
1315 : }
1316 0 : return rc->ndelonly;
1317 : }
1318 :
1319 :
1320 : /*
1321 : * __es_remove_extent - removes block range from extent status tree
1322 : *
1323 : * @inode - file containing range
1324 : * @lblk - first block in range
1325 : * @end - last block in range
1326 : * @reserved - number of cluster reservations released
1327 : * @prealloc - pre-allocated es to avoid memory allocation failures
1328 : *
1329 : * If @reserved is not NULL and delayed allocation is enabled, counts
1330 : * block/cluster reservations freed by removing range and if bigalloc
1331 : * enabled cancels pending reservations as needed. Returns 0 on success,
1332 : * error code on failure.
1333 : */
1334 0 : static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1335 : ext4_lblk_t end, int *reserved,
1336 : struct extent_status *prealloc)
1337 : {
1338 0 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1339 0 : struct rb_node *node;
1340 0 : struct extent_status *es;
1341 0 : struct extent_status orig_es;
1342 0 : ext4_lblk_t len1, len2;
1343 0 : ext4_fsblk_t block;
1344 0 : int err = 0;
1345 0 : bool count_reserved = true;
1346 0 : struct rsvd_count rc;
1347 :
1348 0 : if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1349 : count_reserved = false;
1350 :
1351 0 : es = __es_tree_search(&tree->root, lblk);
1352 0 : if (!es)
1353 0 : goto out;
1354 0 : if (es->es_lblk > end)
1355 0 : goto out;
1356 :
1357 : /* Simply invalidate cache_es. */
1358 0 : tree->cache_es = NULL;
1359 0 : if (count_reserved)
1360 0 : init_rsvd(inode, lblk, es, &rc);
1361 :
1362 0 : orig_es.es_lblk = es->es_lblk;
1363 0 : orig_es.es_len = es->es_len;
1364 0 : orig_es.es_pblk = es->es_pblk;
1365 :
1366 0 : len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1367 0 : len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1368 0 : if (len1 > 0)
1369 0 : es->es_len = len1;
1370 0 : if (len2 > 0) {
1371 0 : if (len1 > 0) {
1372 0 : struct extent_status newes;
1373 :
1374 0 : newes.es_lblk = end + 1;
1375 0 : newes.es_len = len2;
1376 0 : block = 0x7FDEADBEEFULL;
1377 0 : if (ext4_es_is_written(&orig_es) ||
1378 : ext4_es_is_unwritten(&orig_es))
1379 0 : block = ext4_es_pblock(&orig_es) +
1380 0 : orig_es.es_len - len2;
1381 0 : ext4_es_store_pblock_status(&newes, block,
1382 : ext4_es_status(&orig_es));
1383 0 : err = __es_insert_extent(inode, &newes, prealloc);
1384 0 : if (err) {
1385 0 : if (!ext4_es_must_keep(&newes))
1386 0 : return 0;
1387 :
1388 0 : es->es_lblk = orig_es.es_lblk;
1389 0 : es->es_len = orig_es.es_len;
1390 0 : goto out;
1391 : }
1392 : } else {
1393 0 : es->es_lblk = end + 1;
1394 0 : es->es_len = len2;
1395 0 : if (ext4_es_is_written(es) ||
1396 : ext4_es_is_unwritten(es)) {
1397 0 : block = orig_es.es_pblk + orig_es.es_len - len2;
1398 0 : ext4_es_store_pblock(es, block);
1399 : }
1400 : }
1401 0 : if (count_reserved)
1402 0 : count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
1403 : &orig_es, &rc);
1404 0 : goto out_get_reserved;
1405 : }
1406 :
1407 0 : if (len1 > 0) {
1408 0 : if (count_reserved)
1409 0 : count_rsvd(inode, lblk, orig_es.es_len - len1,
1410 : &orig_es, &rc);
1411 0 : node = rb_next(&es->rb_node);
1412 0 : if (node)
1413 : es = rb_entry(node, struct extent_status, rb_node);
1414 : else
1415 0 : es = NULL;
1416 : }
1417 :
1418 0 : while (es && ext4_es_end(es) <= end) {
1419 0 : if (count_reserved)
1420 0 : count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1421 0 : node = rb_next(&es->rb_node);
1422 0 : rb_erase(&es->rb_node, &tree->root);
1423 0 : ext4_es_free_extent(inode, es);
1424 0 : if (!node) {
1425 : es = NULL;
1426 : break;
1427 : }
1428 : es = rb_entry(node, struct extent_status, rb_node);
1429 : }
1430 :
1431 0 : if (es && es->es_lblk < end + 1) {
1432 0 : ext4_lblk_t orig_len = es->es_len;
1433 :
1434 0 : len1 = ext4_es_end(es) - end;
1435 0 : if (count_reserved)
1436 0 : count_rsvd(inode, es->es_lblk, orig_len - len1,
1437 : es, &rc);
1438 0 : es->es_lblk = end + 1;
1439 0 : es->es_len = len1;
1440 0 : if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1441 0 : block = es->es_pblk + orig_len - len1;
1442 0 : ext4_es_store_pblock(es, block);
1443 : }
1444 : }
1445 :
1446 0 : out_get_reserved:
1447 0 : if (count_reserved)
1448 0 : *reserved = get_rsvd(inode, end, es, &rc);
1449 0 : out:
1450 : return err;
1451 : }
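
/*
 * A worked example of the splitting logic above, with hypothetical
 * numbers: removing blocks [5, 9] from a written extent [0/15) at
 * pblk 1000 yields len1 = 5 and len2 = 5, so the extent is trimmed to
 * [0/5) at pblk 1000 and a new extent [10/5) is inserted at pblk
 * 1010, i.e. the original pblk shifted by orig_es.es_len - len2.
 */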
1452 :
1453 : /*
1454 : * ext4_es_remove_extent - removes block range from extent status tree
1455 : *
1456 : * @inode - file containing range
1457 : * @lblk - first block in range
1458 : * @len - number of blocks to remove
1459 : *
1460 : * Reduces block/cluster reservation count and for bigalloc cancels pending
1461 : * reservations as needed.
1462 : */
1463 0 : void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1464 : ext4_lblk_t len)
1465 : {
1466 0 : ext4_lblk_t end;
1467 0 : int err = 0;
1468 0 : int reserved = 0;
1469 0 : struct extent_status *es = NULL;
1470 :
1471 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1472 : return;
1473 :
1474 0 : trace_ext4_es_remove_extent(inode, lblk, len);
1475 0 : es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1476 : lblk, len, inode->i_ino);
1477 :
1478 0 : if (!len)
1479 : return;
1480 :
1481 0 : end = lblk + len - 1;
1482 0 : BUG_ON(end < lblk);
1483 :
1484 0 : retry:
1485 0 : if (err && !es)
1486 0 : es = __es_alloc_extent(true);
1487 : /*
1488 : * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1489 : * so that we are sure __es_shrink() is done with the inode before it
1490 : * is reclaimed.
1491 : */
1492 0 : write_lock(&EXT4_I(inode)->i_es_lock);
1493 0 : err = __es_remove_extent(inode, lblk, end, &reserved, es);
1494 0 : if (es && !es->es_len)
1495 0 : __es_free_extent(es);
1496 0 : write_unlock(&EXT4_I(inode)->i_es_lock);
1497 0 : if (err)
1498 0 : goto retry;
1499 :
1500 0 : ext4_es_print_tree(inode);
1501 0 : ext4_da_release_space(inode, reserved);
1502 0 : return;
1503 : }
1504 :
1505 0 : static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1506 : struct ext4_inode_info *locked_ei)
1507 : {
1508 0 : struct ext4_inode_info *ei;
1509 0 : struct ext4_es_stats *es_stats;
1510 0 : ktime_t start_time;
1511 0 : u64 scan_time;
1512 0 : int nr_to_walk;
1513 0 : int nr_shrunk = 0;
1514 0 : int retried = 0, nr_skipped = 0;
1515 :
1516 0 : es_stats = &sbi->s_es_stats;
1517 0 : start_time = ktime_get();
1518 :
1519 0 : retry:
1520 0 : spin_lock(&sbi->s_es_lock);
1521 0 : nr_to_walk = sbi->s_es_nr_inode;
1522 0 : while (nr_to_walk-- > 0) {
1523 0 : if (list_empty(&sbi->s_es_list)) {
1524 0 : spin_unlock(&sbi->s_es_lock);
1525 0 : goto out;
1526 : }
1527 0 : ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1528 : i_es_list);
1529 : /* Move the inode to the tail */
1530 0 : list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1531 :
1532 : /*
1533 : * Normally we try hard to avoid shrinking precached inodes,
1534 : * but we will as a last resort.
1535 : */
1536 0 : if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1537 : EXT4_STATE_EXT_PRECACHED)) {
1538 0 : nr_skipped++;
1539 0 : continue;
1540 : }
1541 :
1542 0 : if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1543 0 : nr_skipped++;
1544 0 : continue;
1545 : }
1546 : /*
1547 : * Now we hold i_es_lock which protects us from inode reclaim
1548 : * freeing inode under us
1549 : */
1550 0 : spin_unlock(&sbi->s_es_lock);
1551 :
1552 0 : nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1553 0 : write_unlock(&ei->i_es_lock);
1554 :
1555 0 : if (nr_to_scan <= 0)
1556 0 : goto out;
1557 0 : spin_lock(&sbi->s_es_lock);
1558 : }
1559 0 : spin_unlock(&sbi->s_es_lock);
1560 :
1561 : /*
1562 : * If we skipped any inodes, and we weren't able to make any
1563 : * forward progress, try again to scan precached inodes.
1564 : */
1565 0 : if ((nr_shrunk == 0) && nr_skipped && !retried) {
1566 0 : retried++;
1567 0 : goto retry;
1568 : }
1569 :
1570 0 : if (locked_ei && nr_shrunk == 0)
1571 0 : nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1572 :
1573 0 : out:
1574 0 : scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1575 0 : if (likely(es_stats->es_stats_scan_time))
1576 0 : es_stats->es_stats_scan_time = (scan_time +
1577 0 : es_stats->es_stats_scan_time*3) / 4;
1578 : else
1579 0 : es_stats->es_stats_scan_time = scan_time;
1580 0 : if (scan_time > es_stats->es_stats_max_scan_time)
1581 0 : es_stats->es_stats_max_scan_time = scan_time;
1582 0 : if (likely(es_stats->es_stats_shrunk))
1583 0 : es_stats->es_stats_shrunk = (nr_shrunk +
1584 0 : es_stats->es_stats_shrunk*3) / 4;
1585 : else
1586 0 : es_stats->es_stats_shrunk = nr_shrunk;
1587 :
1588 0 : trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1589 : nr_skipped, retried);
1590 0 : return nr_shrunk;
1591 : }
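
/*
 * A note on the statistics above: the scan-time and shrunk-object
 * averages are smoothed exponentially, new = (sample + 3 * old) / 4.
 * For example, a previous average scan time of 1000 ns and a new
 * sample of 2000 ns yield (2000 + 3 * 1000) / 4 = 1250 ns.
 */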
1592 :
1593 0 : static unsigned long ext4_es_count(struct shrinker *shrink,
1594 : struct shrink_control *sc)
1595 : {
1596 0 : unsigned long nr;
1597 0 : struct ext4_sb_info *sbi;
1598 :
1599 0 : sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1600 0 : nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1601 0 : trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1602 0 : return nr;
1603 : }
1604 :
1605 0 : static unsigned long ext4_es_scan(struct shrinker *shrink,
1606 : struct shrink_control *sc)
1607 : {
1608 0 : struct ext4_sb_info *sbi = container_of(shrink,
1609 : struct ext4_sb_info, s_es_shrinker);
1610 0 : int nr_to_scan = sc->nr_to_scan;
1611 0 : int ret, nr_shrunk;
1612 :
1613 0 : ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1614 0 : trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1615 :
1616 0 : nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1617 :
1618 0 : ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1619 0 : trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1620 0 : return nr_shrunk;
1621 : }
1622 :
1623 0 : int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1624 : {
1625 0 : struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1626 0 : struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1627 0 : struct ext4_inode_info *ei, *max = NULL;
1628 0 : unsigned int inode_cnt = 0;
1629 :
1630 0 : if (v != SEQ_START_TOKEN)
1631 : return 0;
1632 :
1633 : /* here we just find an inode that has the max nr. of objects */
1634 0 : spin_lock(&sbi->s_es_lock);
1635 0 : list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1636 0 : inode_cnt++;
1637 0 : if (max && max->i_es_all_nr < ei->i_es_all_nr)
1638 : max = ei;
1639 0 : else if (!max)
1640 0 : max = ei;
1641 : }
1642 0 : spin_unlock(&sbi->s_es_lock);
1643 :
1644 0 : seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1645 : percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1646 : percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1647 0 : seq_printf(seq, " %lld/%lld cache hits/misses\n",
1648 : percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1649 : percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1650 0 : if (inode_cnt)
1651 0 : seq_printf(seq, " %d inodes on list\n", inode_cnt);
1652 :
1653 0 : seq_printf(seq, "average:\n %llu us scan time\n",
1654 : div_u64(es_stats->es_stats_scan_time, 1000));
1655 0 : seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
1656 0 : if (inode_cnt)
1657 0 : seq_printf(seq,
1658 : "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1659 : " %llu us max scan time\n",
1660 : max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1661 : div_u64(es_stats->es_stats_max_scan_time, 1000));
1662 :
1663 : return 0;
1664 : }
1665 :
1666 0 : int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1667 : {
1668 0 : int err;
1669 :
1670 : /* Make sure we have enough bits for physical block number */
1671 0 : BUILD_BUG_ON(ES_SHIFT < 48);
1672 0 : INIT_LIST_HEAD(&sbi->s_es_list);
1673 0 : sbi->s_es_nr_inode = 0;
1674 0 : spin_lock_init(&sbi->s_es_lock);
1675 0 : sbi->s_es_stats.es_stats_shrunk = 0;
1676 0 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1677 : GFP_KERNEL);
1678 0 : if (err)
1679 : return err;
1680 0 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1681 : GFP_KERNEL);
1682 0 : if (err)
1683 0 : goto err1;
1684 0 : sbi->s_es_stats.es_stats_scan_time = 0;
1685 0 : sbi->s_es_stats.es_stats_max_scan_time = 0;
1686 0 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1687 0 : if (err)
1688 0 : goto err2;
1689 0 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1690 0 : if (err)
1691 0 : goto err3;
1692 :
1693 0 : sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1694 0 : sbi->s_es_shrinker.count_objects = ext4_es_count;
1695 0 : sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1696 0 : err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
1697 0 : sbi->s_sb->s_id);
1698 0 : if (err)
1699 0 : goto err4;
1700 :
1701 : return 0;
1702 : err4:
1703 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1704 0 : err3:
1705 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1706 0 : err2:
1707 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1708 0 : err1:
1709 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1710 0 : return err;
1711 : }
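 :
 : /*
 :  * Caller's view (a simplified sketch; the label name is illustrative):
 :  * the shrinker is registered once per mount from ext4_fill_super() and
 :  * torn down again on mount failure or unmount:
 :  *
 :  *	err = ext4_es_register_shrinker(sbi);
 :  *	if (err)
 :  *		goto failed_mount;
 :  *	...
 :  *	ext4_es_unregister_shrinker(sbi);
 :  */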
1712 :
1713 0 : void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1714 : {
1715 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1716 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1717 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1718 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1719 0 : unregister_shrinker(&sbi->s_es_shrinker);
1720 0 : }
1721 :
1722 : /*
1723 : * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1724 : * most *nr_to_scan extents, update *nr_to_scan accordingly.
1725 : *
1726 : * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1727 : * Increment *nr_shrunk by the number of reclaimed extents. Also update
1728 : * ei->i_es_shrink_lblk to where we should continue scanning.
1729 : */
1730 0 : static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1731 : int *nr_to_scan, int *nr_shrunk)
1732 : {
1733 0 : struct inode *inode = &ei->vfs_inode;
1734 0 : struct ext4_es_tree *tree = &ei->i_es_tree;
1735 0 : struct extent_status *es;
1736 0 : struct rb_node *node;
1737 :
1738 0 : es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1739 0 : if (!es)
1740 0 : goto out_wrap;
1741 :
1742 0 : while (*nr_to_scan > 0) {
1743 0 : if (es->es_lblk > end) {
1744 0 : ei->i_es_shrink_lblk = end + 1;
1745 0 : return 0;
1746 : }
1747 :
1748 0 : (*nr_to_scan)--;
1749 0 : node = rb_next(&es->rb_node);
1750 :
1751 0 : if (ext4_es_must_keep(es))
1752 0 : goto next;
1753 0 : if (ext4_es_is_referenced(es)) {
1754 0 : ext4_es_clear_referenced(es);
1755 0 : goto next;
1756 : }
1757 :
1758 0 : rb_erase(&es->rb_node, &tree->root);
1759 0 : ext4_es_free_extent(inode, es);
1760 0 : (*nr_shrunk)++;
1761 0 : next:
1762 0 : if (!node)
1763 0 : goto out_wrap;
1764 : es = rb_entry(node, struct extent_status, rb_node);
1765 : }
1766 0 : ei->i_es_shrink_lblk = es->es_lblk;
1767 0 : return 1;
1768 0 : out_wrap:
1769 0 : ei->i_es_shrink_lblk = 0;
1770 0 : return 0;
1771 : }
1772 :
1773 0 : static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1774 : {
1775 0 : struct inode *inode = &ei->vfs_inode;
1776 0 : int nr_shrunk = 0;
1777 0 : ext4_lblk_t start = ei->i_es_shrink_lblk;
1778 0 : static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1779 : DEFAULT_RATELIMIT_BURST);
1780 :
1781 0 : if (ei->i_es_shk_nr == 0)
1782 : return 0;
1783 :
1784 0 : if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1785 0 : __ratelimit(&_rs))
1786 0 : ext4_warning(inode->i_sb, "forced shrink of precached extents");
1787 :
1788 0 : if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1789 : start != 0)
1790 0 : es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1791 :
1792 0 : ei->i_es_tree.cache_es = NULL;
1793 0 : return nr_shrunk;
1794 : }
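 :
 : /*
 :  * Cursor sketch: if a previous pass stopped with i_es_shrink_lblk == N,
 :  * the pass above first scans [N, EXT_MAX_BLOCKS]; only when that hits
 :  * the end of the tree with scan budget left does it wrap around to
 :  * [0, N - 1].  Successive shrinker passes therefore cycle over the
 :  * whole tree rather than repeatedly reclaiming from its start.
 :  */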
1795 :
1796 : /*
1797 : * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
1798 : * discretionary entries from the extent status cache. (Some entries
1799 : * must be present for proper operation.)
1800 : */
1801 0 : void ext4_clear_inode_es(struct inode *inode)
1802 : {
1803 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1804 0 : struct extent_status *es;
1805 0 : struct ext4_es_tree *tree;
1806 0 : struct rb_node *node;
1807 :
1808 0 : write_lock(&ei->i_es_lock);
1809 0 : tree = &EXT4_I(inode)->i_es_tree;
1810 0 : tree->cache_es = NULL;
1811 0 : node = rb_first(&tree->root);
1812 0 : while (node) {
1813 0 : es = rb_entry(node, struct extent_status, rb_node);
1814 0 : node = rb_next(node);
1815 0 : if (!ext4_es_must_keep(es)) {
1816 0 : rb_erase(&es->rb_node, &tree->root);
1817 0 : ext4_es_free_extent(inode, es);
1818 : }
1819 : }
1820 0 : ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1821 0 : write_unlock(&ei->i_es_lock);
1822 0 : }
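 :
 : /*
 :  * Minimal userspace sketch (illustrative, not part of this file;
 :  * EXT4_IOC_CLEAR_ES_CACHE is defined in fs/ext4/ext4.h, as _IO('f', 40)
 :  * at the time of writing, and takes no argument):
 :  *
 :  *	#include <fcntl.h>
 :  *	#include <sys/ioctl.h>
 :  *	#include <unistd.h>
 :  *
 :  *	#define EXT4_IOC_CLEAR_ES_CACHE	_IO('f', 40)
 :  *
 :  *	int clear_es_cache(const char *path)
 :  *	{
 :  *		int fd = open(path, O_RDONLY), ret;
 :  *
 :  *		if (fd < 0)
 :  *			return -1;
 :  *		ret = ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE);
 :  *		close(fd);
 :  *		return ret;
 :  *	}
 :  */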
1823 :
1824 : #ifdef ES_DEBUG__
1825 : static void ext4_print_pending_tree(struct inode *inode)
1826 : {
1827 : struct ext4_pending_tree *tree;
1828 : struct rb_node *node;
1829 : struct pending_reservation *pr;
1830 :
1831 : printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1832 : tree = &EXT4_I(inode)->i_pending_tree;
1833 : node = rb_first(&tree->root);
1834 : while (node) {
1835 : pr = rb_entry(node, struct pending_reservation, rb_node);
1836 : printk(KERN_DEBUG " %u", pr->lclu);
1837 : node = rb_next(node);
1838 : }
1839 : printk(KERN_DEBUG "\n");
1840 : }
1841 : #else
1842 : #define ext4_print_pending_tree(inode)
1843 : #endif
1844 :
1845 2 : int __init ext4_init_pending(void)
1846 : {
1847 2 : ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
1848 2 : if (ext4_pending_cachep == NULL)
1849 0 : return -ENOMEM;
1850 : return 0;
1851 : }
1852 :
1853 0 : void ext4_exit_pending(void)
1854 : {
1855 0 : kmem_cache_destroy(ext4_pending_cachep);
1856 0 : }
1857 :
1858 0 : void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1859 : {
1860 0 : tree->root = RB_ROOT;
1861 0 : }
1862 :
1863 : /*
1864 : * __get_pending - retrieve a pointer to a pending reservation
1865 : *
1866 : * @inode - file containing the pending cluster reservation
1867 : * @lclu - logical cluster of interest
1868 : *
1869 : * Returns a pointer to a pending reservation if it's a member of
1870 : * the set, and NULL if not. Must be called holding i_es_lock.
1871 : */
1872 0 : static struct pending_reservation *__get_pending(struct inode *inode,
1873 : ext4_lblk_t lclu)
1874 : {
1875 0 : struct ext4_pending_tree *tree;
1876 0 : struct rb_node *node;
1877 0 : struct pending_reservation *pr = NULL;
1878 :
1879 0 : tree = &EXT4_I(inode)->i_pending_tree;
1880 0 : node = (&tree->root)->rb_node;
1881 :
1882 0 : while (node) {
1883 0 : pr = rb_entry(node, struct pending_reservation, rb_node);
1884 0 : if (lclu < pr->lclu)
1885 0 : node = node->rb_left;
1886 0 : else if (lclu > pr->lclu)
1887 0 : node = node->rb_right;
1888 0 : else if (lclu == pr->lclu)
1889 0 : return pr;
1890 : }
1891 : return NULL;
1892 : }
1893 :
1894 : /*
1895 : * __insert_pending - adds a pending cluster reservation to the set of
1896 : * pending reservations
1897 : *
1898 : * @inode - file containing the cluster
1899 : * @lblk - logical block in the cluster to be added
1900 : *
1901 : * Returns 0 on successful insertion and -ENOMEM on failure. If the
1902 : * pending reservation is already in the set, returns successfully.
1903 : */
1904 0 : static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
1905 : {
1906 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1907 0 : struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1908 0 : struct rb_node **p = &tree->root.rb_node;
1909 0 : struct rb_node *parent = NULL;
1910 0 : struct pending_reservation *pr;
1911 0 : ext4_lblk_t lclu;
1912 0 : int ret = 0;
1913 :
1914 0 : lclu = EXT4_B2C(sbi, lblk);
1915 : /* search to find parent for insertion */
1916 0 : while (*p) {
1917 0 : parent = *p;
1918 0 : pr = rb_entry(parent, struct pending_reservation, rb_node);
1919 :
1920 0 : if (lclu < pr->lclu) {
1921 0 : p = &(*p)->rb_left;
1922 0 : } else if (lclu > pr->lclu) {
1923 0 : p = &(*p)->rb_right;
1924 : } else {
1925 : /* pending reservation already inserted */
1926 0 : goto out;
1927 : }
1928 : }
1929 :
1930 0 : pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
1931 0 : if (pr == NULL) {
1932 0 : ret = -ENOMEM;
1933 0 : goto out;
1934 : }
1935 0 : pr->lclu = lclu;
1936 :
1937 0 : rb_link_node(&pr->rb_node, parent, p);
1938 0 : rb_insert_color(&pr->rb_node, &tree->root);
1939 :
1940 0 : out:
1941 0 : return ret;
1942 : }
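 :
 : /*
 :  * Worked example (hypothetical bigalloc geometry): with s_cluster_ratio
 :  * == 16 (s_cluster_bits == 4), EXT4_B2C() maps logical block 35 to
 :  * logical cluster 35 >> 4 == 2, so a delayed block anywhere in blocks
 :  * 32..47 is represented by the single pending reservation keyed by
 :  * lclu 2 in this tree.
 :  */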
1943 :
1944 : /*
1945 : * __remove_pending - removes a pending cluster reservation from the set
1946 : * of pending reservations
1947 : *
1948 : * @inode - file containing the cluster
1949 : * @lblk - logical block in the pending cluster reservation to be removed
1950 : *
1951 : * Does nothing if the pending reservation is not a member of the set.
1952 : */
1953 0 : static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1954 : {
1955 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1956 0 : struct pending_reservation *pr;
1957 0 : struct ext4_pending_tree *tree;
1958 :
1959 0 : pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1960 0 : if (pr != NULL) {
1961 0 : tree = &EXT4_I(inode)->i_pending_tree;
1962 0 : rb_erase(&pr->rb_node, &tree->root);
1963 0 : kmem_cache_free(ext4_pending_cachep, pr);
1964 : }
1965 0 : }
1966 :
1967 : /*
1968 : * ext4_remove_pending - removes a pending cluster reservation from the set
1969 : * of pending reservations
1970 : *
1971 : * @inode - file containing the cluster
1972 : * @lblk - logical block in the pending cluster reservation to be removed
1973 : *
1974 : * Locking for external use of __remove_pending.
1975 : */
1976 0 : void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
1977 : {
1978 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1979 :
1980 0 : write_lock(&ei->i_es_lock);
1981 0 : __remove_pending(inode, lblk);
1982 0 : write_unlock(&ei->i_es_lock);
1983 0 : }
1984 :
1985 : /*
1986 : * ext4_is_pending - determine whether a cluster has a pending reservation
1987 : * on it
1988 : *
1989 : * @inode - file containing the cluster
1990 : * @lblk - logical block in the cluster
1991 : *
1992 : * Returns true if there's a pending reservation for the cluster in the
1993 : * set of pending reservations, and false if not.
1994 : */
1995 0 : bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
1996 : {
1997 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1998 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1999 0 : bool ret;
2000 :
2001 0 : read_lock(&ei->i_es_lock);
2002 0 : ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
2003 0 : read_unlock(&ei->i_es_lock);
2004 :
2005 0 : return ret;
2006 : }
2007 :
2008 : /*
2009 : * ext4_es_insert_delayed_block - adds a delayed block to the extents status
2010 : * tree, adding a pending reservation where
2011 : * needed
2012 : *
2013 : * @inode - file containing the newly added block
2014 : * @lblk - logical block to be added
2015 : * @allocated - indicates whether a physical cluster has been allocated for
2016 : * the logical cluster that contains the block
2017 : */
2018 0 : void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
2019 : bool allocated)
2020 : {
2021 0 : struct extent_status newes;
2022 0 : int err1 = 0;
2023 0 : int err2 = 0;
2024 0 : struct extent_status *es1 = NULL;
2025 0 : struct extent_status *es2 = NULL;
2026 :
2027 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
2028 : return;
2029 :
2030 0 : es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
2031 : lblk, inode->i_ino);
2032 :
2033 0 : newes.es_lblk = lblk;
2034 0 : newes.es_len = 1;
2035 0 : ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2036 0 : trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
2037 :
2038 0 : ext4_es_insert_extent_check(inode, &newes);
2039 :
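 : /*
 :  * Retry pattern: the first pass below takes i_es_lock with no
 :  * pre-allocated entries; if either tree operation then fails for lack
 :  * of memory, the lock is dropped and es1/es2 are pre-allocated via
 :  * __es_alloc_extent(true) (a no-fail allocation) before retrying.
 :  */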
2040 0 : retry:
2041 0 : if (err1 && !es1)
2042 0 : es1 = __es_alloc_extent(true);
2043 0 : if ((err1 || err2) && !es2)
2044 0 : es2 = __es_alloc_extent(true);
2045 0 : write_lock(&EXT4_I(inode)->i_es_lock);
2046 :
2047 0 : err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
2048 0 : if (err1 != 0)
2049 0 : goto error;
2050 :
2051 0 : err2 = __es_insert_extent(inode, &newes, es2);
2052 0 : if (err2 != 0)
2053 0 : goto error;
2054 :
2055 0 : if (allocated)
2056 0 : __insert_pending(inode, lblk);
2057 :
2058 : /* free the pre-allocated extent entries if they went unused */
2059 0 : if (es1 && !es1->es_len)
2060 0 : __es_free_extent(es1);
2061 0 : if (es2 && !es2->es_len)
2062 0 : __es_free_extent(es2);
2063 0 : error:
2064 0 : write_unlock(&EXT4_I(inode)->i_es_lock);
2065 0 : if (err1 || err2)
2066 0 : goto retry;
2067 :
2068 : ext4_es_print_tree(inode);
2069 : ext4_print_pending_tree(inode);
2070 : return;
2071 : }
2072 :
2073 : /*
2074 : * __es_delayed_clu - count number of clusters containing blocks that
2075 : * are delayed only
2076 : *
2077 : * @inode - file containing block range
2078 : * @start - logical block defining start of range
2079 : * @end - logical block defining end of range
2080 : *
2081 : * Returns the number of clusters containing only delayed (not delayed
2082 : * and unwritten) blocks in the range specified by @start and @end.  A
2083 : * cluster that lies even partly within the range is counted as a whole
2084 : * cluster if it contains at least one delayed-only block in that range.
2085 : */
2086 0 : static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
2087 : ext4_lblk_t end)
2088 : {
2089 0 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
2090 0 : struct extent_status *es;
2091 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2092 0 : struct rb_node *node;
2093 0 : ext4_lblk_t first_lclu, last_lclu;
2094 0 : unsigned long long last_counted_lclu;
2095 0 : unsigned int n = 0;
2096 :
2097 : /* guaranteed to be unequal to any ext4_lblk_t value */
2098 0 : last_counted_lclu = ~0ULL;
2099 :
2100 0 : es = __es_tree_search(&tree->root, start);
2101 :
2102 0 : while (es && (es->es_lblk <= end)) {
2103 0 : if (ext4_es_is_delonly(es)) {
2104 0 : if (es->es_lblk <= start)
2105 0 : first_lclu = EXT4_B2C(sbi, start);
2106 : else
2107 0 : first_lclu = EXT4_B2C(sbi, es->es_lblk);
2108 :
2109 0 : if (ext4_es_end(es) >= end)
2110 0 : last_lclu = EXT4_B2C(sbi, end);
2111 : else
2112 0 : last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
2113 :
2114 0 : if (first_lclu == last_counted_lclu)
2115 0 : n += last_lclu - first_lclu;
2116 : else
2117 0 : n += last_lclu - first_lclu + 1;
2118 0 : last_counted_lclu = last_lclu;
2119 : }
2120 0 : node = rb_next(&es->rb_node);
2121 0 : if (!node)
2122 : break;
2123 : es = rb_entry(node, struct extent_status, rb_node);
2124 : }
2125 :
2126 0 : return n;
2127 : }
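 :
 : /*
 :  * Worked example (hypothetical, s_cluster_ratio == 4): with delayed-only
 :  * extents covering blocks [2, 5] and [6, 7] and a query over [0, 7],
 :  * clusters 0 and 1 are each counted once; last_counted_lclu keeps the
 :  * second extent, which also starts in cluster 1, from counting that
 :  * cluster twice.  The result is 2.
 :  */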
2128 :
2129 : /*
2130 : * ext4_es_delayed_clu - count number of clusters containing blocks that
2131 : * are delayed only (not delayed and unwritten)
2132 : *
2133 : * @inode - file containing block range
2134 : * @lblk - logical block defining start of range
2135 : * @len - number of blocks in range
2136 : *
2137 : * Locking for external use of __es_delayed_clu().
2138 : */
2139 0 : unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
2140 : ext4_lblk_t len)
2141 : {
2142 0 : struct ext4_inode_info *ei = EXT4_I(inode);
2143 0 : ext4_lblk_t end;
2144 0 : unsigned int n;
2145 :
2146 0 : if (len == 0)
2147 : return 0;
2148 :
2149 0 : end = lblk + len - 1;
2150 0 : WARN_ON(end < lblk);
2151 :
2152 0 : read_lock(&ei->i_es_lock);
2153 :
2154 0 : n = __es_delayed_clu(inode, lblk, end);
2155 :
2156 0 : read_unlock(&ei->i_es_lock);
2157 :
2158 0 : return n;
2159 : }
2160 :
2161 : /*
2162 : * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2163 : * reservations for a specified block range depending
2164 : * upon the presence or absence of delayed blocks
2165 : * outside the range within clusters at the ends of the
2166 : * range
2167 : *
2168 : * @inode - file containing the range
2169 : * @lblk - logical block defining the start of range
2170 : * @len - length of range in blocks
2171 : *
2172 : * Used after a newly allocated extent is added to the extents status tree.
2173 : * Requires that the extents in the range have either written or unwritten
2174 : * status. Must be called while holding i_es_lock.
2175 : */
2176 0 : static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2177 : ext4_lblk_t len)
2178 : {
2179 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2180 0 : ext4_lblk_t end = lblk + len - 1;
2181 0 : ext4_lblk_t first, last;
2182 0 : bool f_del = false, l_del = false;
2183 :
2184 0 : if (len == 0)
2185 : return;
2186 :
2187 : /*
2188 : * Two cases - block range within single cluster and block range
2189 : * spanning two or more clusters. Note that a cluster belonging
2190 : * to a range starting and/or ending on a cluster boundary is treated
2191 : * as if it does not contain a delayed extent. The new range may
2192 : * have allocated space for previously delayed blocks out to the
2193 : * cluster boundary, requiring that any pre-existing pending
2194 : * reservation be canceled. Because this code only looks at blocks
2195 : * outside the range, it should revise pending reservations
2196 : * correctly even if the extent represented by the range can't be
2197 : * inserted in the extents status tree due to ENOSPC.
2198 : */
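 : /*
 :  * Worked example (hypothetical, s_cluster_ratio == 4): after writing
 :  * out blocks [5, 6] (cluster 1 covers blocks 4..7), only blocks 4 and
 :  * 7 are examined.  If block 4 is still delayed-only, a pending
 :  * reservation for cluster 1 is ensured; otherwise block 7 decides:
 :  * delayed-only keeps the reservation, anything else removes it.
 :  */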
2199 :
2200 0 : if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2201 0 : first = EXT4_LBLK_CMASK(sbi, lblk);
2202 0 : if (first != lblk)
2203 0 : f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2204 : first, lblk - 1);
2205 0 : if (f_del) {
2206 0 : __insert_pending(inode, first);
2207 : } else {
2208 0 : last = EXT4_LBLK_CMASK(sbi, end) +
2209 : sbi->s_cluster_ratio - 1;
2210 0 : if (last != end)
2211 0 : l_del = __es_scan_range(inode,
2212 : &ext4_es_is_delonly,
2213 : end + 1, last);
2214 0 : if (l_del)
2215 0 : __insert_pending(inode, last);
2216 : else
2217 0 : __remove_pending(inode, last);
2218 : }
2219 : } else {
2220 0 : first = EXT4_LBLK_CMASK(sbi, lblk);
2221 0 : if (first != lblk)
2222 0 : f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2223 : first, lblk - 1);
2224 0 : if (f_del)
2225 0 : __insert_pending(inode, first);
2226 : else
2227 0 : __remove_pending(inode, first);
2228 :
2229 0 : last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2230 0 : if (last != end)
2231 0 : l_del = __es_scan_range(inode, &ext4_es_is_delonly,
2232 : end + 1, last);
2233 0 : if (l_del)
2234 0 : __insert_pending(inode, last);
2235 : else
2236 0 : __remove_pending(inode, last);
2237 : }
2238 : }