Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 : * Written by Alex Tomas <alex@clusterfs.com>
5 : */
6 :
7 :
8 : /*
9 : * mballoc.c contains the multiblocks allocation routines
10 : */
11 :
12 : #include "ext4_jbd2.h"
13 : #include "mballoc.h"
14 : #include <linux/log2.h>
15 : #include <linux/module.h>
16 : #include <linux/slab.h>
17 : #include <linux/nospec.h>
18 : #include <linux/backing-dev.h>
19 : #include <trace/events/ext4.h>
20 :
21 : /*
22 : * MUSTDO:
23 : * - test ext4_ext_search_left() and ext4_ext_search_right()
24 : * - search for metadata in few groups
25 : *
26 : * TODO v4:
27 : * - normalization should take into account whether file is still open
28 : * - discard preallocations if no free space left (policy?)
29 : * - don't normalize tails
30 : * - quota
31 : * - reservation for superuser
32 : *
33 : * TODO v3:
34 : * - bitmap read-ahead (proposed by Oleg Drokin aka green)
35 : * - track min/max extents in each group for better group selection
36 : * - mb_mark_used() may allocate chunk right after splitting buddy
37 : * - tree of groups sorted by number of free blocks
38 : * - error handling
39 : */
40 :
41 : /*
42 : * The allocation request involves a request for multiple blocks
43 : * near the goal (block) value specified.
44 : *
45 : * During the initialization phase of the allocator we decide to use
46 : * group preallocation or inode preallocation depending on the size of
47 : * the file. The size of the file could be the resulting file size we
48 : * would have after allocation, or the current file size, whichever
49 : * is larger. If the size is less than sbi->s_mb_stream_request we
50 : * select group preallocation. The default value of
51 : * s_mb_stream_request is 16 blocks. This can also be tuned via
52 : * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
53 : * terms of number of blocks.
54 : *
55 : * The main motivation for having small files use group preallocation is to
56 : * ensure that small files are kept close together on the disk.
57 : *
58 : * In the first stage the allocator looks at the inode prealloc list,
59 : * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
60 : * spaces for this particular inode. The inode prealloc space is
61 : * represented as:
62 : *
63 : * pa_lstart -> the logical start block for this prealloc space
64 : * pa_pstart -> the physical start block for this prealloc space
65 : * pa_len -> length for this prealloc space (in clusters)
66 : * pa_free -> free space available in this prealloc space (in clusters)
67 : *
68 : * The inode preallocation space is used by looking at the _logical_ start
69 : * block. Only if the logical file block falls within the range of a prealloc
70 : * space do we consume that particular prealloc space. This makes sure that
71 : * we have contiguous physical blocks representing the file blocks.
72 : *
73 : * The important thing to note about the inode prealloc space is that
74 : * we don't modify the values associated with it except
75 : * pa_free.
76 : *
77 : * If we are not able to find blocks in the inode prealloc space and if we
78 : * have the group allocation flag set then we look at the locality group
79 : * prealloc space. This is a per-CPU prealloc list represented as
80 : *
81 : * ext4_sb_info.s_locality_groups[smp_processor_id()]
82 : *
83 : * The reason for having a per cpu locality group is to reduce the contention
84 : * between CPUs. It is possible to get scheduled at this point.
85 : *
86 : * The locality group prealloc space is used by looking at whether we have
87 : * enough free space (pa_free) within the prealloc space.
88 : *
89 : * If we can't allocate blocks via inode prealloc and/or the locality group
90 : * prealloc then we look at the buddy cache. The buddy cache is represented
91 : * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
92 : * mapped to the buddy and bitmap information regarding different
93 : * groups. The buddy information is attached to the buddy cache inode so that
94 : * we can access it through the page cache. The information regarding
95 : * each group is loaded via ext4_mb_load_buddy. It consists of the
96 : * block bitmap and buddy information, and it is stored in the
97 : * inode as:
98 : *
99 : * { page }
100 : * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
101 : *
102 : *
103 : * one block each for bitmap and buddy information. So for each group we
104 : * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
105 : * blocksize) blocks. So it can hold information for groups_per_page
106 : * groups, where groups_per_page is blocks_per_page/2.
107 : *
108 : * The buddy cache inode is not stored on disk. The inode is thrown
109 : * away when the filesystem is unmounted.
110 : *
111 : * We look for the requested number of blocks in the buddy cache. If we were
112 : * able to locate that many free blocks we return with additional information
113 : * regarding the rest of the contiguous physical blocks available.
114 : *
115 : * Before allocating blocks via the buddy cache we normalize the request
116 : * blocks. This ensures we ask for more blocks than we need. The extra
117 : * blocks that we get after allocation are added to the respective prealloc
118 : * list. In the case of inode preallocation we follow a list of heuristics
119 : * based on file size. This can be found in ext4_mb_normalize_request. If
120 : * we are doing a group prealloc we try to normalize the request to
121 : * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
122 : * dependent on the cluster size; for non-bigalloc file systems, it is
123 : * 512 blocks. This can be tuned via
124 : * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
125 : * terms of number of blocks. If we have mounted the file system with the -o
126 : * stripe=<value> option, the group prealloc request is normalized to the
127 : * smallest multiple of the stripe value (sbi->s_stripe) which is
128 : * greater than the default mb_group_prealloc.
129 : *
130 : * If "mb_optimize_scan" mount option is set, we maintain in memory group info
131 : * structures in two data structures:
132 : *
133 : * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
134 : *
135 : * Locking: sbi->s_mb_largest_free_orders_locks (array of rw locks)
136 : *
137 : * This is an array of lists where the index in the array represents the
138 : * largest free order in the buddy bitmap of the group infos placed on
139 : * that list. So there are exactly MB_NUM_ORDERS(sb) lists (the total
140 : * number of buddy bitmap orders possible). Group infos are
141 : * placed on the appropriate lists.
142 : *
143 : * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
144 : *
145 : * Locking: sbi->s_mb_avg_fragment_size_locks (array of rw locks)
146 : *
147 : * This is an array of lists where in the i-th list there are groups with
148 : * average fragment size >= 2^i and < 2^(i+1). The average fragment size
149 : * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
150 : * Note that we don't bother with a special list for completely empty groups
151 : * so we only have MB_NUM_ORDERS(sb) lists.
152 : *
153 : * When "mb_optimize_scan" mount option is set, mballoc consults the above data
154 : * structures to decide the order in which groups are to be traversed for
155 : * fulfilling an allocation request.
156 : *
157 : * At CR_POWER2_ALIGNED, we look for groups which have the largest_free_order
158 : * >= the order of the request. We directly look at the largest free order list
159 : * in the data structure (1) above where largest_free_order = order of the
160 : * request. If that list is empty, we look at the remaining lists in increasing
161 : * order of largest_free_order. This allows us to perform the CR_POWER2_ALIGNED
162 : * lookup in O(1) time.
163 : *
164 : * At CR_GOAL_LEN_FAST, we only consider groups where the
165 : * average fragment size > request size. So, we look up a group which has an
166 : * average fragment size just above or equal to the request size using our
167 : * average fragment size group lists (data structure 2) in O(1) time.
168 : *
169 : * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
170 : * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
171 : * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
172 : * fragment size > goal length. So before falling back to the slower
173 : * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim the goal length and
174 : * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
175 : * enough average fragment size. This increases the chances of finding a
176 : * suitable block group in O(1) time and results in faster allocation at the
177 : * cost of reduced size of allocation.
178 : *
179 : * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
180 : * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
181 : * CR_GOAL_LEN_FAST phase.
182 : *
183 : * The regular allocator (using the buddy cache) supports a few tunables.
184 : *
185 : * /sys/fs/ext4/<partition>/mb_min_to_scan
186 : * /sys/fs/ext4/<partition>/mb_max_to_scan
187 : * /sys/fs/ext4/<partition>/mb_order2_req
188 : * /sys/fs/ext4/<partition>/mb_linear_limit
189 : *
190 : * The regular allocator uses buddy scan only if the request len is a power of
191 : * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
192 : * value of s_mb_order2_reqs can be tuned via
193 : * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
194 : * the stripe size (sbi->s_stripe), we try to search for contiguous blocks of
195 : * stripe size. This should result in better allocation on RAID setups. If
196 : * not, we search in the specific group using the bitmap for best extents. The
197 : * tunables min_to_scan and max_to_scan control the behaviour here.
198 : * min_to_scan indicates how long mballoc __must__ look for a best
199 : * extent and max_to_scan indicates how long mballoc __can__ look for a
200 : * best extent among the found extents. Searching for the blocks starts with
201 : * the group specified as the goal value in the allocation context via
202 : * ac_g_ex. Each group is first checked against the criteria that determine
203 : * whether it can be used for allocation. ext4_mb_good_group explains how the
204 : * groups are checked.
205 : *
206 : * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
207 : * get traversed linearly. That may result in subsequent allocations being not
208 : * close to each other. And so, the underlying device may get filled up in a
209 : * non-linear fashion. While that may not matter on non-rotational devices, for
210 : * rotational devices that may result in higher seek times. "mb_linear_limit"
211 : * tells mballoc how many groups mballoc should search linearly before
212 : * performing consulting above data structures for more efficient lookups. For
213 : * non rotational devices, this value defaults to 0 and for rotational devices
214 : * this is set to MB_DEFAULT_LINEAR_LIMIT.
215 : *
216 : * Both of the prealloc spaces are populated as described above. So for the first
217 : * request we will hit the buddy cache, which will result in this prealloc
218 : * space getting filled. The prealloc space is then later used for
219 : * subsequent requests.
220 : */
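To make the bitmap/buddy layout described above concrete, the following is a minimal sketch, not part of mballoc (the example_ helper is hypothetical), of how a group's bitmap and buddy blocks map to pages of the buddy cache inode under the two-blocks-per-group scheme used later in this file:

/*
 * Illustrative sketch only: locate the page and in-page offset of group
 * @group's bitmap and buddy blocks in the buddy cache inode. Each group
 * uses two consecutive blocks: block 2*group holds the bitmap and block
 * 2*group + 1 holds the buddy.
 */
static void example_buddy_cache_location(struct super_block *sb,
					 ext4_group_t group)
{
	int blocks_per_page = PAGE_SIZE / sb->s_blocksize;
	int bitmap_block = group * 2;
	int buddy_block  = group * 2 + 1;

	pgoff_t bitmap_page = bitmap_block / blocks_per_page;
	int bitmap_off = (bitmap_block % blocks_per_page) * sb->s_blocksize;
	pgoff_t buddy_page = buddy_block / blocks_per_page;
	int buddy_off = (buddy_block % blocks_per_page) * sb->s_blocksize;

	/*
	 * With 4K blocks and 4K pages blocks_per_page == 1, so a group's
	 * bitmap and buddy live on two different pages (2*group and
	 * 2*group + 1). With 1K blocks blocks_per_page == 4, so each page
	 * holds the bitmap and buddy of two groups.
	 */
	(void)bitmap_page; (void)bitmap_off;
	(void)buddy_page; (void)buddy_off;
}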
221 :
222 : /*
223 : * mballoc operates on the following data:
224 : * - on-disk bitmap
225 : * - in-core buddy (actually includes buddy and bitmap)
226 : * - preallocation descriptors (PAs)
227 : *
228 : * there are two types of preallocations:
229 : * - inode
230 : * assigned to a specific inode and can be used for this inode only.
231 : * it describes part of the inode's space preallocated to specific
232 : * physical blocks. any block from that preallocation can be used
233 : * independently. the descriptor just tracks the number of blocks left
234 : * unused. so, before taking some block from the descriptor, one must
235 : * make sure the corresponding logical block isn't allocated yet. this
236 : * also means that freeing any block within the descriptor's range
237 : * must discard all preallocated blocks.
238 : * - locality group
239 : * assigned to a specific locality group which does not translate to a
240 : * permanent set of inodes: an inode can join and leave the group. space
241 : * from this type of preallocation can be used for any inode. thus
242 : * it's consumed from the beginning to the end.
243 : *
244 : * relation between them can be expressed as:
245 : * in-core buddy = on-disk bitmap + preallocation descriptors
246 : *
247 : * this means the blocks mballoc considers used are:
248 : * - allocated blocks (persistent)
249 : * - preallocated blocks (non-persistent)
250 : *
251 : * consistency in mballoc world means that at any time a block is either
252 : * free or used in ALL structures. notice: "any time" should not be read
253 : * literally -- time is discrete and delimited by locks.
254 : *
255 : * to keep it simple, we don't use block numbers, instead we count number of
256 : * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
257 : *
258 : * all operations can be expressed as:
259 : * - init buddy: buddy = on-disk + PAs
260 : * - new PA: buddy += N; PA = N
261 : * - use inode PA: on-disk += N; PA -= N
262 : * - discard inode PA: buddy -= on-disk - PA; PA = 0
263 : * - use locality group PA: on-disk += N; PA -= N
264 : * - discard locality group PA: buddy -= PA; PA = 0
265 : * note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
266 : * is used in the real operation, because we can't know the actual used
267 : * bits from the PA, only from the on-disk bitmap
268 : *
269 : * if we follow this strict logic, then all operations above should be atomic.
270 : * given some of them can block, we'd have to use something like semaphores
271 : * killing performance on high-end SMP hardware. let's try to relax it using
272 : * the following knowledge:
273 : * 1) if buddy is referenced, it's already initialized
274 : * 2) while block is used in buddy and the buddy is referenced,
275 : * nobody can re-allocate that block
276 : * 3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
277 : * bitmap has a bit set and a PA claims the same block, it's OK. IOW, one can
278 : * set a bit in the on-disk bitmap if the buddy has the same bit set and/or a
279 : * PA covers the corresponding block
280 : *
281 : * so, now we're building a concurrency table:
282 : * - init buddy vs.
283 : * - new PA
284 : * blocks for PA are allocated in the buddy, buddy must be referenced
285 : * until PA is linked to allocation group to avoid concurrent buddy init
286 : * - use inode PA
287 : * we need to make sure that either on-disk bitmap or PA has uptodate data
288 : * given (3) we care that PA-=N operation doesn't interfere with init
289 : * - discard inode PA
290 : * the simplest way would be to have buddy initialized by the discard
291 : * - use locality group PA
292 : * again PA-=N must be serialized with init
293 : * - discard locality group PA
294 : * the simplest way would be to have buddy initialized by the discard
295 : * - new PA vs.
296 : * - use inode PA
297 : * i_data_sem serializes them
298 : * - discard inode PA
299 : * discard process must wait until PA isn't used by another process
300 : * - use locality group PA
301 : * some mutex should serialize them
302 : * - discard locality group PA
303 : * discard process must wait until PA isn't used by another process
304 : * - use inode PA
305 : * - use inode PA
306 : * i_data_sem or another mutex should serialize them
307 : * - discard inode PA
308 : * discard process must wait until PA isn't used by another process
309 : * - use locality group PA
310 : * nothing wrong here -- they're different PAs covering different blocks
311 : * - discard locality group PA
312 : * discard process must wait until PA isn't used by another process
313 : *
314 : * now we're ready to draw a few conclusions:
315 : * - while a PA is referenced, no discard of it is possible
316 : * - a PA stays referenced until its blocks are marked in the on-disk bitmap
317 : * - a PA changes only after the on-disk bitmap does
318 : * - discard must not compete with init. either init is done before
319 : * any discard or they're serialized somehow
320 : * - buddy init as the sum of on-disk bitmap and PAs is done atomically
321 : *
322 : * a special case is when we've consumed a PA to emptiness. no need to modify
323 : * the buddy in this case, but we should care about concurrent init
324 : *
325 : */
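As a toy illustration of the bookkeeping above (counting blocks only, exactly as the comment suggests), the following sketch is not ext4 code; it just shows how each operation preserves the relation in-core buddy = on-disk bitmap + PAs:

/* Toy model of per-group "used block" counts in each structure. */
struct toy_counts {
	unsigned int ondisk;	/* blocks marked used in the on-disk bitmap */
	unsigned int pa;	/* blocks still held (unused) by PAs */
	unsigned int buddy;	/* blocks marked used in the in-core buddy */
};

/* new PA of n blocks: reserved in the buddy right away */
static void toy_new_pa(struct toy_counts *c, unsigned int n)
{
	c->buddy += n;
	c->pa += n;
}

/* use n blocks from a PA: they become persistent on disk */
static void toy_use_pa(struct toy_counts *c, unsigned int n)
{
	c->ondisk += n;
	c->pa -= n;
}

/* discard a PA: whatever was never written goes back to free */
static void toy_discard_pa(struct toy_counts *c)
{
	/*
	 * In the real code, the blocks of the PA that are not set in the
	 * on-disk bitmap are freed in the buddy ("buddy -= on-disk - PA");
	 * count-wise that equals what is left in the PA.
	 */
	c->buddy -= c->pa;
	c->pa = 0;
}

/* After every operation: c->buddy == c->ondisk + c->pa. */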
326 :
327 : /*
328 : * Logic in a few words:
329 : *
330 : * - allocation:
331 : * load group
332 : * find blocks
333 : * mark bits in on-disk bitmap
334 : * release group
335 : *
336 : * - use preallocation:
337 : * find proper PA (per-inode or group)
338 : * load group
339 : * mark bits in on-disk bitmap
340 : * release group
341 : * release PA
342 : *
343 : * - free:
344 : * load group
345 : * mark bits in on-disk bitmap
346 : * release group
347 : *
348 : * - discard preallocations in group:
349 : * mark PAs deleted
350 : * move them onto local list
351 : * load on-disk bitmap
352 : * load group
353 : * remove PA from object (inode or locality group)
354 : * mark free blocks in-core
355 : *
356 : * - discard inode's preallocations:
357 : */
358 :
359 : /*
360 : * Locking rules
361 : *
362 : * Locks:
363 : * - bitlock on a group (group)
364 : * - object (inode/locality) (object)
365 : * - per-pa lock (pa)
366 : * - cr_power2_aligned lists lock (cr_power2_aligned)
367 : * - cr_goal_len_fast lists lock (cr_goal_len_fast)
368 : *
369 : * Paths:
370 : * - new pa
371 : * object
372 : * group
373 : *
374 : * - find and use pa:
375 : * pa
376 : *
377 : * - release consumed pa:
378 : * pa
379 : * group
380 : * object
381 : *
382 : * - generate in-core bitmap:
383 : * group
384 : * pa
385 : *
386 : * - discard all for given object (inode, locality group):
387 : * object
388 : * pa
389 : * group
390 : *
391 : * - discard all for given group:
392 : * group
393 : * pa
394 : * group
395 : * object
396 : *
397 : * - allocation path (ext4_mb_regular_allocator)
398 : * group
399 : * cr_power2_aligned/cr_goal_len_fast
400 : */
401 : static struct kmem_cache *ext4_pspace_cachep;
402 : static struct kmem_cache *ext4_ac_cachep;
403 : static struct kmem_cache *ext4_free_data_cachep;
404 :
405 : /* We create slab caches for groupinfo data structures based on the
406 : * superblock block size. There will be one per mounted filesystem for
407 : * each unique s_blocksize_bits */
408 : #define NR_GRPINFO_CACHES 8
409 : static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
410 :
411 : static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
412 : "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
413 : "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
414 : "ext4_groupinfo_64k", "ext4_groupinfo_128k"
415 : };
416 :
417 : static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
418 : ext4_group_t group);
419 : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
420 : ext4_group_t group);
421 : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
422 :
423 : static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
424 : ext4_group_t group, enum criteria cr);
425 :
426 : static int ext4_try_to_trim_range(struct super_block *sb,
427 : struct ext4_buddy *e4b, ext4_grpblk_t start,
428 : ext4_grpblk_t max, ext4_grpblk_t minblocks);
429 :
430 : /*
431 : * The algorithm using this percpu seq counter is as follows:
432 : * 1. We sample the percpu discard_pa_seq counter before trying for block
433 : * allocation in ext4_mb_new_blocks().
434 : * 2. We increment this percpu discard_pa_seq counter when we either allocate
435 : * or free these blocks i.e. while marking those blocks as used/free in
436 : * mb_mark_used()/mb_free_blocks().
437 : * 3. We also increment this percpu seq counter when we successfully identify
438 : * that the bb_prealloc_list is not empty and hence proceed for discarding
439 : * of those PAs inside ext4_mb_discard_group_preallocations().
440 : *
441 : * Now, to make sure that the regular fast path of block allocation is not
442 : * affected, as a small optimization we only sample the percpu seq counter
443 : * on the current cpu. Only when the block allocation fails and no freed blocks
444 : * were found do we sample the percpu seq counter for all cpus, using the
445 : * function ext4_get_discard_pa_seq_sum() below. This happens after making
446 : * sure that all the PAs on grp->bb_prealloc_list got freed or that the list is empty.
447 : */
448 : static DEFINE_PER_CPU(u64, discard_pa_seq);
449 446 : static inline u64 ext4_get_discard_pa_seq_sum(void)
450 : {
451 446 : int __cpu;
452 446 : u64 __seq = 0;
453 :
454 2228 : for_each_possible_cpu(__cpu)
455 1781 : __seq += per_cpu(discard_pa_seq, __cpu);
456 445 : return __seq;
457 : }
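A simplified sketch of the pattern the comment above describes is shown below; the example_* helpers are placeholders (the real logic, including an extra strict-check pass, lives in ext4_mb_new_blocks()), and only the cheap-sample / expensive-sum split is the point:

/*
 * Illustrative only: sample the counter cheaply up front, and pay for
 * the all-CPU sum only on the slow path, when allocation failed and
 * discarding freed nothing.
 */
static int example_alloc_with_discard_retry(struct super_block *sb)
{
	u64 seq = this_cpu_read(discard_pa_seq);	/* fast path: this CPU only */
	u64 sum;
	int freed, ret;

	while (1) {
		ret = example_run_regular_allocator(sb);	/* placeholder */
		if (ret != -ENOSPC)
			return ret;

		freed = example_discard_preallocations(sb);	/* placeholder */
		if (freed)
			continue;		/* PAs were released, retry */

		/* slow path: did any CPU allocate/free/discard meanwhile? */
		sum = ext4_get_discard_pa_seq_sum();
		if (sum == seq)
			return -ENOSPC;
		seq = sum;		/* something changed, retry once more */
	}
}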
458 :
459 : static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
460 : {
461 : #if BITS_PER_LONG == 64
462 3364458863 : *bit += ((unsigned long) addr & 7UL) << 3;
463 3364458863 : addr = (void *) ((unsigned long) addr & ~7UL);
464 : #elif BITS_PER_LONG == 32
465 : *bit += ((unsigned long) addr & 3UL) << 3;
466 : addr = (void *) ((unsigned long) addr & ~3UL);
467 : #else
468 : #error "how many bits you are?!"
469 : #endif
470 3364458863 : return addr;
471 : }
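For example, on a 64-bit machine a bitmap pointer that sits 5 bytes past an 8-byte boundary is rounded down by 5 bytes and the bit index is bumped by 5 * 8 = 40; the (addr, bit) pair still names the same bit, but addr is now aligned to an unsigned long, which the ext4_*_bit helpers require on architectures such as powerpc.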
472 :
473 3020835651 : static inline int mb_test_bit(int bit, void *addr)
474 : {
475 : /*
476 : * ext4_test_bit on architectures like powerpc
477 : * needs an unsigned long aligned address
478 : */
479 3020835651 : addr = mb_correct_addr_and_bit(&bit, addr);
480 3020835651 : return ext4_test_bit(bit, addr);
481 : }
482 :
483 73270782 : static inline void mb_set_bit(int bit, void *addr)
484 : {
485 73270782 : addr = mb_correct_addr_and_bit(&bit, addr);
486 73270782 : ext4_set_bit(bit, addr);
487 73270021 : }
488 :
489 36914588 : static inline void mb_clear_bit(int bit, void *addr)
490 : {
491 36914588 : addr = mb_correct_addr_and_bit(&bit, addr);
492 36914588 : ext4_clear_bit(bit, addr);
493 36914612 : }
494 :
495 40227496 : static inline int mb_test_and_clear_bit(int bit, void *addr)
496 : {
497 40227496 : addr = mb_correct_addr_and_bit(&bit, addr);
498 40227496 : return ext4_test_and_clear_bit(bit, addr);
499 : }
500 :
501 169842610 : static inline int mb_find_next_zero_bit(void *addr, int max, int start)
502 : {
503 169842610 : int fix = 0, ret, tmpmax;
504 169842610 : addr = mb_correct_addr_and_bit(&fix, addr);
505 169842610 : tmpmax = max + fix;
506 169842610 : start += fix;
507 :
508 169842610 : ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
509 169885752 : if (ret > max)
510 : return max;
511 : return ret;
512 : }
513 :
514 23367736 : static inline int mb_find_next_bit(void *addr, int max, int start)
515 : {
516 23367736 : int fix = 0, ret, tmpmax;
517 23367736 : addr = mb_correct_addr_and_bit(&fix, addr);
518 23367736 : tmpmax = max + fix;
519 23367736 : start += fix;
520 :
521 23367736 : ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
522 23367653 : if (ret > max)
523 : return max;
524 : return ret;
525 : }
526 :
527 3065653436 : static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
528 : {
529 3065653436 : char *bb;
530 :
531 3065653436 : BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
532 3065653436 : BUG_ON(max == NULL);
533 :
534 3065653436 : if (order > e4b->bd_blkbits + 1) {
535 20153 : *max = 0;
536 20153 : return NULL;
537 : }
538 :
539 : /* at order 0 we see each particular block */
540 3065633283 : if (order == 0) {
541 286459304 : *max = 1 << (e4b->bd_blkbits + 3);
542 286459304 : return e4b->bd_bitmap;
543 : }
544 :
545 2779173979 : bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
546 2779173979 : *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
547 :
548 2779173979 : return bb;
549 : }
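As a minimal usage sketch (illustrative only, not mballoc code; example_first_free_cluster is hypothetical), a caller holding a loaded struct ext4_buddy could find the first free cluster of a group by scanning the order-0 map returned by mb_find_buddy():

/* Illustrative only: first free cluster in a loaded buddy, at order 0. */
static ext4_grpblk_t example_first_free_cluster(struct ext4_buddy *e4b)
{
	int max;
	void *bitmap = mb_find_buddy(e4b, 0, &max);

	/* at order 0, max == 1 << (bd_blkbits + 3), i.e. 32768 for 4K blocks */
	return mb_find_next_zero_bit(bitmap, max, 0);
}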
550 :
551 : #ifdef DOUBLE_CHECK
552 : static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
553 : int first, int count)
554 : {
555 : int i;
556 : struct super_block *sb = e4b->bd_sb;
557 :
558 : if (unlikely(e4b->bd_info->bb_bitmap == NULL))
559 : return;
560 : assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
561 : for (i = 0; i < count; i++) {
562 : if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
563 : ext4_fsblk_t blocknr;
564 :
565 : blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
566 : blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
567 : ext4_grp_locked_error(sb, e4b->bd_group,
568 : inode ? inode->i_ino : 0,
569 : blocknr,
570 : "freeing block already freed "
571 : "(bit %u)",
572 : first + i);
573 : ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
574 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
575 : }
576 : mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
577 : }
578 : }
579 :
580 : static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
581 : {
582 : int i;
583 :
584 : if (unlikely(e4b->bd_info->bb_bitmap == NULL))
585 : return;
586 : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
587 : for (i = 0; i < count; i++) {
588 : BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
589 : mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
590 : }
591 : }
592 :
593 : static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
594 : {
595 : if (unlikely(e4b->bd_info->bb_bitmap == NULL))
596 : return;
597 : if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
598 : unsigned char *b1, *b2;
599 : int i;
600 : b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
601 : b2 = (unsigned char *) bitmap;
602 : for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
603 : if (b1[i] != b2[i]) {
604 : ext4_msg(e4b->bd_sb, KERN_ERR,
605 : "corruption in group %u "
606 : "at byte %u(%u): %x in copy != %x "
607 : "on disk/prealloc",
608 : e4b->bd_group, i, i * 8, b1[i], b2[i]);
609 : BUG();
610 : }
611 : }
612 : }
613 : }
614 :
615 : static void mb_group_bb_bitmap_alloc(struct super_block *sb,
616 : struct ext4_group_info *grp, ext4_group_t group)
617 : {
618 : struct buffer_head *bh;
619 :
620 : grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
621 : if (!grp->bb_bitmap)
622 : return;
623 :
624 : bh = ext4_read_block_bitmap(sb, group);
625 : if (IS_ERR_OR_NULL(bh)) {
626 : kfree(grp->bb_bitmap);
627 : grp->bb_bitmap = NULL;
628 : return;
629 : }
630 :
631 : memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
632 : put_bh(bh);
633 : }
634 :
635 : static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
636 : {
637 : kfree(grp->bb_bitmap);
638 : }
639 :
640 : #else
641 : static inline void mb_free_blocks_double(struct inode *inode,
642 : struct ext4_buddy *e4b, int first, int count)
643 : {
644 : return;
645 : }
646 : static inline void mb_mark_used_double(struct ext4_buddy *e4b,
647 : int first, int count)
648 : {
649 : return;
650 : }
651 : static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
652 : {
653 : return;
654 : }
655 :
656 : static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
657 : struct ext4_group_info *grp, ext4_group_t group)
658 : {
659 : return;
660 : }
661 :
662 : static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
663 : {
664 : return;
665 : }
666 : #endif
667 :
668 : #ifdef AGGRESSIVE_CHECK
669 :
670 : #define MB_CHECK_ASSERT(assert) \
671 : do { \
672 : if (!(assert)) { \
673 : printk(KERN_EMERG \
674 : "Assertion failure in %s() at %s:%d: \"%s\"\n", \
675 : function, file, line, # assert); \
676 : BUG(); \
677 : } \
678 : } while (0)
679 :
680 : static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
681 : const char *function, int line)
682 : {
683 : struct super_block *sb = e4b->bd_sb;
684 : int order = e4b->bd_blkbits + 1;
685 : int max;
686 : int max2;
687 : int i;
688 : int j;
689 : int k;
690 : int count;
691 : struct ext4_group_info *grp;
692 : int fragments = 0;
693 : int fstart;
694 : struct list_head *cur;
695 : void *buddy;
696 : void *buddy2;
697 :
698 : if (e4b->bd_info->bb_check_counter++ % 10)
699 : return 0;
700 :
701 : while (order > 1) {
702 : buddy = mb_find_buddy(e4b, order, &max);
703 : MB_CHECK_ASSERT(buddy);
704 : buddy2 = mb_find_buddy(e4b, order - 1, &max2);
705 : MB_CHECK_ASSERT(buddy2);
706 : MB_CHECK_ASSERT(buddy != buddy2);
707 : MB_CHECK_ASSERT(max * 2 == max2);
708 :
709 : count = 0;
710 : for (i = 0; i < max; i++) {
711 :
712 : if (mb_test_bit(i, buddy)) {
713 : /* only single bit in buddy2 may be 0 */
714 : if (!mb_test_bit(i << 1, buddy2)) {
715 : MB_CHECK_ASSERT(
716 : mb_test_bit((i<<1)+1, buddy2));
717 : }
718 : continue;
719 : }
720 :
721 : /* both bits in buddy2 must be 1 */
722 : MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
723 : MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
724 :
725 : for (j = 0; j < (1 << order); j++) {
726 : k = (i * (1 << order)) + j;
727 : MB_CHECK_ASSERT(
728 : !mb_test_bit(k, e4b->bd_bitmap));
729 : }
730 : count++;
731 : }
732 : MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
733 : order--;
734 : }
735 :
736 : fstart = -1;
737 : buddy = mb_find_buddy(e4b, 0, &max);
738 : for (i = 0; i < max; i++) {
739 : if (!mb_test_bit(i, buddy)) {
740 : MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
741 : if (fstart == -1) {
742 : fragments++;
743 : fstart = i;
744 : }
745 : continue;
746 : }
747 : fstart = -1;
748 : /* check used bits only */
749 : for (j = 0; j < e4b->bd_blkbits + 1; j++) {
750 : buddy2 = mb_find_buddy(e4b, j, &max2);
751 : k = i >> j;
752 : MB_CHECK_ASSERT(k < max2);
753 : MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
754 : }
755 : }
756 : MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
757 : MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
758 :
759 : grp = ext4_get_group_info(sb, e4b->bd_group);
760 : if (!grp)
761 : return NULL;
762 : list_for_each(cur, &grp->bb_prealloc_list) {
763 : ext4_group_t groupnr;
764 : struct ext4_prealloc_space *pa;
765 : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
766 : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
767 : MB_CHECK_ASSERT(groupnr == e4b->bd_group);
768 : for (i = 0; i < pa->pa_len; i++)
769 : MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
770 : }
771 : return 0;
772 : }
773 : #undef MB_CHECK_ASSERT
774 : #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
775 : __FILE__, __func__, __LINE__)
776 : #else
777 : #define mb_check_buddy(e4b)
778 : #endif
779 :
780 : /*
781 : * Divide the blocks starting at @first with length @len into
782 : * smaller chunks, each a power-of-2 number of blocks.
783 : * Clear the bits in the buddy bitmap which the blocks of the chunk(s) cover,
784 : * then increase bb_counters[] for the corresponding chunk size.
785 : */
786 230397 : static void ext4_mb_mark_free_simple(struct super_block *sb,
787 : void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
788 : struct ext4_group_info *grp)
789 : {
790 230397 : struct ext4_sb_info *sbi = EXT4_SB(sb);
791 230397 : ext4_grpblk_t min;
792 230397 : ext4_grpblk_t max;
793 230397 : ext4_grpblk_t chunk;
794 230397 : unsigned int border;
795 :
796 230397 : BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
797 :
798 230397 : border = 2 << sb->s_blocksize_bits;
799 :
800 1356364 : while (len > 0) {
801 : /* find how many blocks can be covered since this position */
802 1125968 : max = ffs(first | border) - 1;
803 :
804 : /* find how many blocks of power 2 we need to mark */
805 1125968 : min = fls(len) - 1;
806 :
807 1125968 : if (max < min)
808 : min = max;
809 1125968 : chunk = 1 << min;
810 :
811 : /* mark multiblock chunks only */
812 1125968 : grp->bb_counters[min]++;
813 1125968 : if (min > 0)
814 1067225 : mb_clear_bit(first >> min,
815 1067225 : buddy + sbi->s_mb_offsets[min]);
816 :
817 1125967 : len -= chunk;
818 1125967 : first += chunk;
819 : }
820 230396 : }
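For a concrete trace of the loop above, assume 4K blocks (so border = 2 << 12 = 8192) and a free range of len = 11 clusters starting at first = 5. The first pass computes max = ffs(5 | 8192) - 1 = 0 and min = fls(11) - 1 = 3, so min is clamped to 0 and a single cluster at 5 is counted in bb_counters[0]. The second pass (first = 6, len = 10) yields max = 1, so a 2-cluster chunk at 6 is counted in bb_counters[1] and bit 3 of the order-1 buddy map is cleared. The third pass (first = 8, len = 8) yields max = min = 3, so an 8-cluster chunk is counted in bb_counters[3] and the range ends up split into buddy-aligned power-of-2 pieces (1 + 2 + 8 = 11).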
821 :
822 : static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
823 : {
824 6867833 : int order;
825 :
826 : /*
827 : * We don't bother with special lists for groups whose free extents are
828 : * only 1 block long, nor for completely empty groups.
829 : */
830 6867833 : order = fls(len) - 2;
831 6867833 : if (order < 0)
832 : return 0;
833 6833801 : if (order == MB_NUM_ORDERS(sb))
834 150767 : order--;
835 : return order;
836 : }
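For example, a group with bb_free = 300 and bb_fragments = 50 has an average fragment size of 6 clusters, so fls(6) - 2 = 1 and it belongs on list 1; a group whose average fragment is a single cluster would give a negative order and is clamped to list 0, and a completely free group, whose single fragment spans the whole group, is capped at MB_NUM_ORDERS(sb) - 1.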
837 :
838 : /* Move group to appropriate avg_fragment_size list */
839 : static void
840 5695943 : mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
841 : {
842 5695943 : struct ext4_sb_info *sbi = EXT4_SB(sb);
843 5695943 : int new_order;
844 :
845 5695943 : if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
846 : return;
847 :
848 5359699 : new_order = mb_avg_fragment_size_order(sb,
849 5359699 : grp->bb_free / grp->bb_fragments);
850 5359699 : if (new_order == grp->bb_avg_fragment_size_order)
851 : return;
852 :
853 498177 : if (grp->bb_avg_fragment_size_order != -1) {
854 338227 : write_lock(&sbi->s_mb_avg_fragment_size_locks[
855 : grp->bb_avg_fragment_size_order]);
856 338225 : list_del(&grp->bb_avg_fragment_size_node);
857 338224 : write_unlock(&sbi->s_mb_avg_fragment_size_locks[
858 : grp->bb_avg_fragment_size_order]);
859 : }
860 498178 : grp->bb_avg_fragment_size_order = new_order;
861 498178 : write_lock(&sbi->s_mb_avg_fragment_size_locks[
862 : grp->bb_avg_fragment_size_order]);
863 498192 : list_add_tail(&grp->bb_avg_fragment_size_node,
864 498192 : &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
865 498191 : write_unlock(&sbi->s_mb_avg_fragment_size_locks[
866 : grp->bb_avg_fragment_size_order]);
867 : }
868 :
869 : /*
870 : * Choose next group by traversing largest_free_order lists. Updates *new_cr if
871 : * cr level needs an update.
872 : */
873 104538 : static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
874 : enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
875 : {
876 104538 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
877 104538 : struct ext4_group_info *iter, *grp;
878 104538 : int i;
879 :
880 104538 : if (ac->ac_status == AC_STATUS_FOUND)
881 : return;
882 :
883 104538 : if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
884 0 : atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
885 :
886 104538 : grp = NULL;
887 428224 : for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
888 325513 : if (list_empty(&sbi->s_mb_largest_free_orders[i]))
889 300180 : continue;
890 25333 : read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
891 25342 : if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
892 0 : read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
893 0 : continue;
894 : }
895 25342 : grp = NULL;
896 160857 : list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
897 : bb_largest_free_order_node) {
898 137372 : if (sbi->s_mb_stats)
899 0 : atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
900 137372 : if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
901 : grp = iter;
902 : break;
903 : }
904 : }
905 25339 : read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
906 25343 : if (grp)
907 : break;
908 : }
909 :
910 104548 : if (!grp) {
911 : /* Increment cr and search again */
912 102711 : *new_cr = CR_GOAL_LEN_FAST;
913 : } else {
914 1837 : *group = grp->bb_group;
915 1837 : ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
916 : }
917 : }
918 :
919 : /*
920 : * Find a suitable group of the given order from the average fragment size lists.
921 : */
922 : static struct ext4_group_info *
923 5325082 : ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
924 : {
925 5325082 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
926 5325082 : struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
927 5325082 : rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
928 5325082 : struct ext4_group_info *grp = NULL, *iter;
929 5325082 : enum criteria cr = ac->ac_criteria;
930 :
931 5325082 : if (list_empty(frag_list))
932 : return NULL;
933 675269 : read_lock(frag_list_lock);
934 675370 : if (list_empty(frag_list)) {
935 0 : read_unlock(frag_list_lock);
936 0 : return NULL;
937 : }
938 11476296 : list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
939 10968000 : if (sbi->s_mb_stats)
940 0 : atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
941 10968000 : if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
942 : grp = iter;
943 : break;
944 : }
945 : }
946 674484 : read_unlock(frag_list_lock);
947 674484 : return grp;
948 : }
949 :
950 : /*
951 : * Choose next group by traversing average fragment size list of suitable
952 : * order. Updates *new_cr if cr level needs an update.
953 : */
954 796074 : static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
955 : enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
956 : {
957 796074 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
958 796074 : struct ext4_group_info *grp = NULL;
959 796074 : int i;
960 :
961 796074 : if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
962 37 : if (sbi->s_mb_stats)
963 0 : atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
964 : }
965 :
966 796074 : for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
967 5392846 : i < MB_NUM_ORDERS(ac->ac_sb); i++) {
968 4612904 : grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
969 4613002 : if (grp)
970 : break;
971 : }
972 :
973 796172 : if (grp) {
974 16230 : *group = grp->bb_group;
975 16230 : ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
976 : } else {
977 779942 : *new_cr = CR_BEST_AVAIL_LEN;
978 : }
979 796172 : }
980 :
981 : /*
982 : * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
983 : * order we have and proactively trim the goal request length to that order to
984 : * find a suitable group faster.
985 : *
986 : * This optimizes allocation speed at the cost of slightly reduced
987 : * preallocations. However, we make sure that we don't trim the request too
988 : * much and fall to CR_GOAL_LEN_SLOW in that case.
989 : */
990 918557 : static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
991 : enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
992 : {
993 918557 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
994 918557 : struct ext4_group_info *grp = NULL;
995 918557 : int i, order, min_order;
996 918557 : unsigned long num_stripe_clusters = 0;
997 :
998 918557 : if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
999 138730 : if (sbi->s_mb_stats)
1000 0 : atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
1001 : }
1002 :
1003 : /*
1004 : * mb_avg_fragment_size_order() returns order in a way that makes
1005 : * retrieving back the length using (1 << order) inaccurate. Hence, use
1006 : * fls() instead since we need to know the actual length while modifying
1007 : * goal length.
1008 : */
1009 918557 : order = fls(ac->ac_g_ex.fe_len) - 1;
1010 918557 : min_order = order - sbi->s_mb_best_avail_max_trim_order;
1011 918557 : if (min_order < 0)
1012 : min_order = 0;
1013 :
1014 918557 : if (sbi->s_stripe > 0) {
1015 : /*
1016 : * We are assuming that stripe size is always a multiple of
1017 : * cluster ratio otherwise __ext4_fill_super exists early.
1018 : */
1019 918523 : num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
1020 918523 : if (1 << min_order < num_stripe_clusters)
1021 : /*
1022 : * We consider 1 order less because later we round
1023 : * up the goal len to num_stripe_clusters
1024 : */
1025 821167 : min_order = fls(num_stripe_clusters) - 1;
1026 : }
1027 :
1028 918557 : if (1 << min_order < ac->ac_o_ex.fe_len)
1029 488014 : min_order = fls(ac->ac_o_ex.fe_len);
1030 :
1031 1480038 : for (i = order; i >= min_order; i--) {
1032 712060 : int frag_order;
1033 : /*
1034 : * Scale down goal len to make sure we find something
1035 : * in the free fragments list. Basically, reduce
1036 : * preallocations.
1037 : */
1038 712060 : ac->ac_g_ex.fe_len = 1 << i;
1039 :
1040 712060 : if (num_stripe_clusters > 0) {
1041 : /*
1042 : * Try to round up the adjusted goal length to
1043 : * stripe size (in cluster units) multiple for
1044 : * efficiency.
1045 : */
1046 712077 : ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
1047 : num_stripe_clusters);
1048 : }
1049 :
1050 712060 : frag_order = mb_avg_fragment_size_order(ac->ac_sb,
1051 : ac->ac_g_ex.fe_len);
1052 :
1053 712060 : grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
1054 712174 : if (grp)
1055 : break;
1056 : }
1057 :
1058 918671 : if (grp) {
1059 150690 : *group = grp->bb_group;
1060 150690 : ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
1061 : } else {
1062 : /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
1063 767981 : ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
1064 767981 : *new_cr = CR_GOAL_LEN_SLOW;
1065 : }
1066 918671 : }
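To illustrate the trimming above: with a normalized goal of 1000 clusters, order = fls(1000) - 1 = 9. Assuming a trim limit (s_mb_best_avail_max_trim_order) of 3, no stripe, and an original request of at most 64 clusters, min_order becomes 6 and the loop retries the average-fragment-size lookup with goal lengths of 512, 256, 128 and 64 clusters; if none of those finds a group, the goal length is restored and we fall back to CR_GOAL_LEN_SLOW.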
1067 :
1068 331213080 : static inline int should_optimize_scan(struct ext4_allocation_context *ac)
1069 : {
1070 331213080 : if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
1071 : return 0;
1072 328227370 : if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
1073 : return 0;
1074 15365166 : if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
1075 2416 : return 0;
1076 : return 1;
1077 : }
1078 :
1079 : /*
1080 : * Return next linear group for allocation. If linear traversal should not be
1081 : * performed, this function just returns the same group
1082 : */
1083 : static int
1084 164582975 : next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
1085 : {
1086 164582975 : if (!should_optimize_scan(ac))
1087 157866898 : goto inc_and_return;
1088 :
1089 6774602 : if (ac->ac_groups_linear_remaining) {
1090 6774663 : ac->ac_groups_linear_remaining--;
1091 6774663 : goto inc_and_return;
1092 : }
1093 :
1094 : return group;
1095 164641561 : inc_and_return:
1096 : /*
1097 : * Artificially restricted ngroups for non-extent
1098 : * files makes group > ngroups possible on first loop.
1099 : */
1100 164641561 : return group + 1 >= ngroups ? 0 : group + 1;
1101 : }
1102 :
1103 : /*
1104 : * ext4_mb_choose_next_group: choose next group for allocation.
1105 : *
1106 : * @ac Allocation Context
1107 : * @new_cr This is an output parameter. If there is no good group
1108 : * available at the current CR level, this field is updated to indicate
1109 : * the new cr level that should be used.
1110 : * @group This is an input / output parameter. As an input it indicates the
1111 : * next group that the allocator intends to use for allocation. As
1112 : * output, this field indicates the next group that should be used as
1113 : * determined by the optimization functions.
1114 : * @ngroups Total number of groups
1115 : */
1116 166305182 : static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
1117 : enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
1118 : {
1119 166305182 : *new_cr = ac->ac_criteria;
1120 :
1121 166305182 : if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
1122 164718585 : *group = next_linear_group(ac, *group, ngroups);
1123 164167449 : return;
1124 : }
1125 :
1126 1819298 : if (*new_cr == CR_POWER2_ALIGNED) {
1127 104619 : ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
1128 1714679 : } else if (*new_cr == CR_GOAL_LEN_FAST) {
1129 796071 : ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
1130 918608 : } else if (*new_cr == CR_BEST_AVAIL_LEN) {
1131 918608 : ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
1132 : } else {
1133 : /*
1134 : * TODO: For CR=2, we can arrange groups in an rb tree sorted by
1135 : * bb_free. But until that happens, we should never come here.
1136 : */
1137 0 : WARN_ON(1);
1138 : }
1139 : }
1140 :
1141 : /*
1142 : * Cache the order of the largest free extent we have available in this block
1143 : * group.
1144 : */
1145 : static void
1146 5696025 : mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
1147 : {
1148 5696025 : struct ext4_sb_info *sbi = EXT4_SB(sb);
1149 5696025 : int i;
1150 :
1151 21167276 : for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
1152 21120788 : if (grp->bb_counters[i] > 0)
1153 : break;
1154 : /* No need to move between order lists? */
1155 5696025 : if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
1156 5403936 : i == grp->bb_largest_free_order) {
1157 5283465 : grp->bb_largest_free_order = i;
1158 5283465 : return;
1159 : }
1160 :
1161 412560 : if (grp->bb_largest_free_order >= 0) {
1162 210251 : write_lock(&sbi->s_mb_largest_free_orders_locks[
1163 : grp->bb_largest_free_order]);
1164 210254 : list_del_init(&grp->bb_largest_free_order_node);
1165 210252 : write_unlock(&sbi->s_mb_largest_free_orders_locks[
1166 : grp->bb_largest_free_order]);
1167 : }
1168 412562 : grp->bb_largest_free_order = i;
1169 412562 : if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
1170 368742 : write_lock(&sbi->s_mb_largest_free_orders_locks[
1171 : grp->bb_largest_free_order]);
1172 368740 : list_add_tail(&grp->bb_largest_free_order_node,
1173 368740 : &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
1174 368738 : write_unlock(&sbi->s_mb_largest_free_orders_locks[
1175 : grp->bb_largest_free_order]);
1176 : }
1177 : }
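For example, if the largest chunk still free in a group's buddy is 64 clusters, then bb_counters[6] is the highest non-zero counter, bb_largest_free_order becomes 6, and (with mb_optimize_scan enabled) the group is moved onto s_mb_largest_free_orders[6].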
1178 :
1179 : static noinline_for_stack
1180 160468 : void ext4_mb_generate_buddy(struct super_block *sb,
1181 : void *buddy, void *bitmap, ext4_group_t group,
1182 : struct ext4_group_info *grp)
1183 : {
1184 160468 : struct ext4_sb_info *sbi = EXT4_SB(sb);
1185 160468 : ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
1186 160468 : ext4_grpblk_t i = 0;
1187 160468 : ext4_grpblk_t first;
1188 160468 : ext4_grpblk_t len;
1189 160468 : unsigned free = 0;
1190 160468 : unsigned fragments = 0;
1191 160468 : unsigned long long period = get_cycles();
1192 :
1193 : /* initialize buddy from bitmap which is aggregation
1194 : * of on-disk bitmap and preallocations */
1195 160466 : i = mb_find_next_zero_bit(bitmap, max, 0);
1196 160467 : grp->bb_first_free = i;
1197 399642 : while (i < max) {
1198 239174 : fragments++;
1199 239174 : first = i;
1200 239174 : i = mb_find_next_bit(bitmap, max, i);
1201 239175 : len = i - first;
1202 239175 : free += len;
1203 239175 : if (len > 1)
1204 230397 : ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
1205 : else
1206 8778 : grp->bb_counters[0]++;
1207 239174 : if (i < max)
1208 80441 : i = mb_find_next_zero_bit(bitmap, max, i);
1209 : }
1210 160468 : grp->bb_fragments = fragments;
1211 :
1212 160468 : if (free != grp->bb_free) {
1213 0 : ext4_grp_locked_error(sb, group, 0, 0,
1214 : "block bitmap and bg descriptor "
1215 : "inconsistent: %u vs %u free clusters",
1216 : free, grp->bb_free);
1217 : /*
1218 : * If we intend to continue, we consider group descriptor
1219 : * corrupt and update bb_free using bitmap value
1220 : */
1221 0 : grp->bb_free = free;
1222 0 : ext4_mark_group_bitmap_corrupted(sb, group,
1223 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1224 : }
1225 160468 : mb_set_largest_free_order(sb, grp);
1226 160468 : mb_update_avg_fragment_size(sb, grp);
1227 :
1228 160468 : clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
1229 :
1230 160468 : period = get_cycles() - period;
1231 160468 : atomic_inc(&sbi->s_mb_buddies_generated);
1232 160467 : atomic64_add(period, &sbi->s_mb_generation_time);
1233 160468 : }
1234 :
1235 : /* The buddy information is attached to the buddy cache inode
1236 : * for convenience. The information regarding each group
1237 : * is loaded via ext4_mb_load_buddy. It consists of the
1238 : * block bitmap and buddy information, and it is
1239 : * stored in the inode as
1240 : *
1241 : * { page }
1242 : * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1243 : *
1244 : *
1245 : * one block each for bitmap and buddy information.
1246 : * So for each group we take up 2 blocks. A page can
1247 : * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
1248 : * So it can hold information for groups_per_page groups, where
1249 : * groups_per_page is blocks_per_page/2.
1250 : *
1251 : * Locking note: This routine takes the block group lock of all groups
1252 : * for this page; do not hold this lock when calling this routine!
1253 : */
1254 :
1255 310963 : static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
1256 : {
1257 310963 : ext4_group_t ngroups;
1258 310963 : int blocksize;
1259 310963 : int blocks_per_page;
1260 310963 : int groups_per_page;
1261 310963 : int err = 0;
1262 310963 : int i;
1263 310963 : ext4_group_t first_group, group;
1264 310963 : int first_block;
1265 310963 : struct super_block *sb;
1266 310963 : struct buffer_head *bhs;
1267 310963 : struct buffer_head **bh = NULL;
1268 310963 : struct inode *inode;
1269 310963 : char *data;
1270 310963 : char *bitmap;
1271 310963 : struct ext4_group_info *grinfo;
1272 :
1273 310963 : inode = page->mapping->host;
1274 310963 : sb = inode->i_sb;
1275 310963 : ngroups = ext4_get_groups_count(sb);
1276 310963 : blocksize = i_blocksize(inode);
1277 310963 : blocks_per_page = PAGE_SIZE / blocksize;
1278 :
1279 310963 : mb_debug(sb, "init page %lu\n", page->index);
1280 :
1281 310963 : groups_per_page = blocks_per_page >> 1;
1282 310963 : if (groups_per_page == 0)
1283 : groups_per_page = 1;
1284 :
1285 : /* allocate buffer_heads to read bitmaps */
1286 3424 : if (groups_per_page > 1) {
1287 3408 : i = sizeof(struct buffer_head *) * groups_per_page;
1288 3408 : bh = kzalloc(i, gfp);
1289 3408 : if (bh == NULL)
1290 : return -ENOMEM;
1291 : } else
1292 : bh = &bhs;
1293 :
1294 310963 : first_group = page->index * blocks_per_page / 2;
1295 :
1296 : /* read all groups the page covers into the cache */
1297 625067 : for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1298 314371 : if (group >= ngroups)
1299 : break;
1300 :
1301 314370 : grinfo = ext4_get_group_info(sb, group);
1302 314371 : if (!grinfo)
1303 0 : continue;
1304 : /*
1305 : * If page is uptodate then we came here after online resize
1306 : * which added some new uninitialized group info structs, so
1307 : * we must skip all initialized uptodate buddies on the page,
1308 : * which may be currently in use by an allocating task.
1309 : */
1310 314371 : if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
1311 0 : bh[i] = NULL;
1312 0 : continue;
1313 : }
1314 314371 : bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
1315 314370 : if (IS_ERR(bh[i])) {
1316 266 : err = PTR_ERR(bh[i]);
1317 266 : bh[i] = NULL;
1318 266 : goto out;
1319 : }
1320 : mb_debug(sb, "read bitmap for group %u\n", group);
1321 : }
1322 :
1323 : /* wait for I/O completion */
1324 624802 : for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1325 314105 : int err2;
1326 :
1327 314105 : if (!bh[i])
1328 1 : continue;
1329 314104 : err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
1330 314104 : if (!err)
1331 314105 : err = err2;
1332 : }
1333 :
1334 310697 : first_block = page->index * blocks_per_page;
1335 631633 : for (i = 0; i < blocks_per_page; i++) {
1336 320936 : group = (first_block + i) >> 1;
1337 320936 : if (group >= ngroups)
1338 : break;
1339 :
1340 320935 : if (!bh[group - first_group])
1341 : /* skip initialized uptodate buddy */
1342 0 : continue;
1343 :
1344 641870 : if (!buffer_verified(bh[group - first_group]))
1345 : /* Skip faulty bitmaps */
1346 0 : continue;
1347 320935 : err = 0;
1348 :
1349 : /*
1350 : * data carries information regarding this
1351 : * particular group in the format specified
1352 : * above
1353 : *
1354 : */
1355 320935 : data = page_address(page) + (i * blocksize);
1356 320935 : bitmap = bh[group - first_group]->b_data;
1357 :
1358 : /*
1359 : * We place the buddy block and bitmap block
1360 : * close together
1361 : */
1362 320935 : if ((first_block + i) & 1) {
1363 : /* this is block of buddy */
1364 160468 : BUG_ON(incore == NULL);
1365 160468 : mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
1366 : group, page->index, i * blocksize);
1367 160468 : trace_ext4_mb_buddy_bitmap_load(sb, group);
1368 160468 : grinfo = ext4_get_group_info(sb, group);
1369 160468 : if (!grinfo) {
1370 0 : err = -EFSCORRUPTED;
1371 0 : goto out;
1372 : }
1373 160468 : grinfo->bb_fragments = 0;
1374 160468 : memset(grinfo->bb_counters, 0,
1375 : sizeof(*grinfo->bb_counters) *
1376 : (MB_NUM_ORDERS(sb)));
1377 : /*
1378 : * incore got set to the group block bitmap below
1379 : */
1380 160468 : ext4_lock_group(sb, group);
1381 : /* init the buddy */
1382 160468 : memset(data, 0xff, blocksize);
1383 160468 : ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
1384 160468 : ext4_unlock_group(sb, group);
1385 160468 : incore = NULL;
1386 : } else {
1387 : /* this is block of bitmap */
1388 160467 : BUG_ON(incore != NULL);
1389 160467 : mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
1390 : group, page->index, i * blocksize);
1391 160467 : trace_ext4_mb_bitmap_load(sb, group);
1392 :
1393 : /* see comments in ext4_mb_put_pa() */
1394 160467 : ext4_lock_group(sb, group);
1395 320934 : memcpy(data, bitmap, blocksize);
1396 :
1397 : /* mark all preallocated blks used in in-core bitmap */
1398 160467 : ext4_mb_generate_from_pa(sb, data, group);
1399 160468 : ext4_mb_generate_from_freelist(sb, data, group);
1400 160468 : ext4_unlock_group(sb, group);
1401 :
1402 : /* set incore so that the buddy information can be
1403 : * generated using this
1404 : */
1405 160468 : incore = data;
1406 : }
1407 : }
1408 310698 : SetPageUptodate(page);
1409 :
1410 310964 : out:
1411 310964 : if (bh) {
1412 625336 : for (i = 0; i < groups_per_page; i++)
1413 314372 : brelse(bh[i]);
1414 310964 : if (bh != &bhs)
1415 3408 : kfree(bh);
1416 : }
1417 : return err;
1418 : }
1419 :
1420 : /*
1421 : * Lock the buddy and bitmap pages. This makes sure another parallel init_group
1422 : * on the same buddy page doesn't happen while we hold the buddy page lock.
1423 : * Return the locked buddy and bitmap pages in the e4b struct. If buddy and bitmap
1424 : * are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
1425 : */
1426 157744 : static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1427 : ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
1428 : {
1429 157744 : struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
1430 157744 : int block, pnum, poff;
1431 157744 : int blocks_per_page;
1432 157744 : struct page *page;
1433 :
1434 157744 : e4b->bd_buddy_page = NULL;
1435 157744 : e4b->bd_bitmap_page = NULL;
1436 :
1437 157744 : blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1438 : /*
1439 : * the buddy cache inode stores the block bitmap
1440 : * and buddy information in consecutive blocks.
1441 : * So for each group we need two blocks.
1442 : */
1443 157744 : block = group * 2;
1444 157744 : pnum = block / blocks_per_page;
1445 157744 : poff = block % blocks_per_page;
1446 157744 : page = find_or_create_page(inode->i_mapping, pnum, gfp);
1447 157745 : if (!page)
1448 : return -ENOMEM;
1449 157745 : BUG_ON(page->mapping != inode->i_mapping);
1450 157745 : e4b->bd_bitmap_page = page;
1451 157745 : e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1452 :
1453 157745 : if (blocks_per_page >= 2) {
1454 : /* buddy and bitmap are on the same page */
1455 : return 0;
1456 : }
1457 :
1458 154321 : block++;
1459 154321 : pnum = block / blocks_per_page;
1460 154321 : page = find_or_create_page(inode->i_mapping, pnum, gfp);
1461 154321 : if (!page)
1462 : return -ENOMEM;
1463 154321 : BUG_ON(page->mapping != inode->i_mapping);
1464 154321 : e4b->bd_buddy_page = page;
1465 154321 : return 0;
1466 : }
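As a quick numeric example of the mapping above, for group 7 on a filesystem with 4K blocks and 4K pages, blocks_per_page is 1, so the bitmap lives on page 14 and the buddy on page 15 of the buddy cache inode; with 1K blocks, blocks_per_page is 4, so block 14 (bitmap, offset 2048 within the page) and block 15 (buddy, offset 3072) both fall on page 3, which is why bd_buddy_page is left NULL in that case.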
1467 :
1468 157745 : static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1469 : {
1470 157745 : if (e4b->bd_bitmap_page) {
1471 157745 : unlock_page(e4b->bd_bitmap_page);
1472 157744 : put_page(e4b->bd_bitmap_page);
1473 : }
1474 157744 : if (e4b->bd_buddy_page) {
1475 154320 : unlock_page(e4b->bd_buddy_page);
1476 154321 : put_page(e4b->bd_buddy_page);
1477 : }
1478 157744 : }
1479 :
1480 : /*
1481 : * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1482 : * block group lock of all groups for this page; do not hold the BG lock when
1483 : * calling this routine!
1484 : */
1485 : static noinline_for_stack
1486 157744 : int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1487 : {
1488 :
1489 157744 : struct ext4_group_info *this_grp;
1490 157744 : struct ext4_buddy e4b;
1491 157744 : struct page *page;
1492 157744 : int ret = 0;
1493 :
1494 157744 : might_sleep();
1495 157744 : mb_debug(sb, "init group %u\n", group);
1496 157744 : this_grp = ext4_get_group_info(sb, group);
1497 157744 : if (!this_grp)
1498 : return -EFSCORRUPTED;
1499 :
1500 : /*
1501 : * This ensures that we don't reinit the buddy cache
1502 : * page which maps to the group from which we are already
1503 : * allocating. If we are looking at the buddy cache we would
1504 : * have taken a reference using ext4_mb_load_buddy and that
1505 : * would have pinned the buddy page in the page cache.
1506 : * The call to ext4_mb_get_buddy_page_lock will mark the
1507 : * page accessed.
1508 : */
1509 157744 : ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1510 157745 : if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1511 : /*
1512 : * somebody initialized the group
1513 : * return without doing anything
1514 : */
1515 530 : goto err;
1516 : }
1517 :
1518 157215 : page = e4b.bd_bitmap_page;
1519 157215 : ret = ext4_mb_init_cache(page, NULL, gfp);
1520 157215 : if (ret)
1521 266 : goto err;
1522 156949 : if (!PageUptodate(page)) {
1523 0 : ret = -EIO;
1524 0 : goto err;
1525 : }
1526 :
1527 156949 : if (e4b.bd_buddy_page == NULL) {
1528 : /*
1529 : * If both the bitmap and buddy are on
1530 : * the same page, we don't need to
1531 : * force-init the buddy
1532 : */
1533 3424 : ret = 0;
1534 3424 : goto err;
1535 : }
1536 : /* init buddy cache */
1537 153525 : page = e4b.bd_buddy_page;
1538 153525 : ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1539 153525 : if (ret)
1540 0 : goto err;
1541 153525 : if (!PageUptodate(page)) {
1542 0 : ret = -EIO;
1543 0 : goto err;
1544 : }
1545 153525 : err:
1546 157745 : ext4_mb_put_buddy_page_lock(&e4b);
1547 157745 : return ret;
1548 : }
1549 :
1550 : /*
1551 : * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1552 : * block group lock of all groups for this page; do not hold the BG lock when
1553 : * calling this routine!
1554 : */
1555 : static noinline_for_stack int
1556 21420778 : ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1557 : struct ext4_buddy *e4b, gfp_t gfp)
1558 : {
1559 21420778 : int blocks_per_page;
1560 21420778 : int block;
1561 21420778 : int pnum;
1562 21420778 : int poff;
1563 21420778 : struct page *page;
1564 21420778 : int ret;
1565 21420778 : struct ext4_group_info *grp;
1566 21420778 : struct ext4_sb_info *sbi = EXT4_SB(sb);
1567 21420778 : struct inode *inode = sbi->s_buddy_cache;
1568 :
1569 21420778 : might_sleep();
1570 21430385 : mb_debug(sb, "load group %u\n", group);
1571 :
1572 21430385 : blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1573 21430385 : grp = ext4_get_group_info(sb, group);
1574 21430751 : if (!grp)
1575 : return -EFSCORRUPTED;
1576 :
1577 21430751 : e4b->bd_blkbits = sb->s_blocksize_bits;
1578 21430751 : e4b->bd_info = grp;
1579 21430751 : e4b->bd_sb = sb;
1580 21430751 : e4b->bd_group = group;
1581 21430751 : e4b->bd_buddy_page = NULL;
1582 21430751 : e4b->bd_bitmap_page = NULL;
1583 :
1584 21430751 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1585 : /*
1586 : * we need full data about the group
1587 : * to make a good selection
1588 : */
1589 3558 : ret = ext4_mb_init_group(sb, group, gfp);
1590 3559 : if (ret)
1591 : return ret;
1592 : }
1593 :
1594 : /*
1595 : * the buddy cache inode stores the block bitmap
1596 : * and buddy information in consecutive blocks.
1597 : * So for each group we need two blocks.
1598 : */
1599 21430752 : block = group * 2;
1600 21430752 : pnum = block / blocks_per_page;
1601 21430752 : poff = block % blocks_per_page;
1602 :
1603 : /* we could use find_or_create_page(), but it locks the page,
1604 : * which we'd like to avoid in the fast path ... */
1605 21430752 : page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1606 21439754 : if (page == NULL || !PageUptodate(page)) {
1607 112 : if (page)
1608 : /*
1609 : * drop the page reference and try
1610 : * to get the page with the lock. If the
1611 : * page is not uptodate, somebody just
1612 : * created it but has not yet initialized
1613 : * it, so wait for that initialization
1614 : * to finish.
1615 : */
1616 0 : put_page(page);
1617 112 : page = find_or_create_page(inode->i_mapping, pnum, gfp);
1618 112 : if (page) {
1619 112 : if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
1620 : "ext4: bitmap's page->mapping != inode->i_mapping\n")) {
1621 : /* should never happen */
1622 0 : unlock_page(page);
1623 0 : ret = -EINVAL;
1624 0 : goto err;
1625 : }
1626 112 : if (!PageUptodate(page)) {
1627 112 : ret = ext4_mb_init_cache(page, NULL, gfp);
1628 112 : if (ret) {
1629 0 : unlock_page(page);
1630 0 : goto err;
1631 : }
1632 : mb_cmp_bitmaps(e4b, page_address(page) +
1633 : (poff * sb->s_blocksize));
1634 : }
1635 112 : unlock_page(page);
1636 : }
1637 : }
1638 21433896 : if (page == NULL) {
1639 0 : ret = -ENOMEM;
1640 0 : goto err;
1641 : }
1642 21433896 : if (!PageUptodate(page)) {
1643 0 : ret = -EIO;
1644 0 : goto err;
1645 : }
1646 :
1647 : /* Pages marked accessed already */
1648 21431644 : e4b->bd_bitmap_page = page;
1649 21431644 : e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1650 :
1651 21431644 : block++;
1652 21431644 : pnum = block / blocks_per_page;
1653 21431644 : poff = block % blocks_per_page;
1654 :
1655 21431644 : page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1656 21444747 : if (page == NULL || !PageUptodate(page)) {
1657 113 : if (page)
1658 1 : put_page(page);
1659 113 : page = find_or_create_page(inode->i_mapping, pnum, gfp);
1660 113 : if (page) {
1661 113 : if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
1662 : "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
1663 : /* should never happen */
1664 0 : unlock_page(page);
1665 0 : ret = -EINVAL;
1666 0 : goto err;
1667 : }
1668 113 : if (!PageUptodate(page)) {
1669 112 : ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1670 : gfp);
1671 112 : if (ret) {
1672 0 : unlock_page(page);
1673 0 : goto err;
1674 : }
1675 : }
1676 113 : unlock_page(page);
1677 : }
1678 : }
1679 21442101 : if (page == NULL) {
1680 0 : ret = -ENOMEM;
1681 0 : goto err;
1682 : }
1683 21442101 : if (!PageUptodate(page)) {
1684 0 : ret = -EIO;
1685 0 : goto err;
1686 : }
1687 :
1688 : /* Pages marked accessed already */
1689 21435251 : e4b->bd_buddy_page = page;
1690 21435251 : e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1691 :
1692 21435251 : return 0;
1693 :
1694 0 : err:
1695 0 : if (page)
1696 0 : put_page(page);
1697 0 : if (e4b->bd_bitmap_page)
1698 0 : put_page(e4b->bd_bitmap_page);
1699 :
1700 0 : e4b->bd_buddy = NULL;
1701 0 : e4b->bd_bitmap = NULL;
1702 0 : return ret;
1703 : }
1704 :
1705 : static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1706 : struct ext4_buddy *e4b)
1707 : {
1708 18324438 : return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1709 : }
1710 :
1711 21448637 : static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1712 : {
1713 21448637 : if (e4b->bd_bitmap_page)
1714 21448637 : put_page(e4b->bd_bitmap_page);
1715 21449445 : if (e4b->bd_buddy_page)
1716 21449445 : put_page(e4b->bd_buddy_page);
1717 21447267 : }
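/*
 * The usual pattern for the load/unload pair above, as used by the
 * scanning code later in this file (a minimal sketch, error handling
 * trimmed):
 *
 *	struct ext4_buddy e4b;
 *
 *	if (ext4_mb_load_buddy(sb, group, &e4b))
 *		return;				// pins bitmap + buddy pages
 *	ext4_lock_group(sb, group);
 *	// ... inspect e4b.bd_bitmap / e4b.bd_buddy under the group lock ...
 *	ext4_unlock_group(sb, group);
 *	ext4_mb_unload_buddy(&e4b);		// drops the page references
 */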
1718 :
1719 :
1720 500207666 : static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1721 : {
1722 500207666 : int order = 1, max;
1723 500207666 : void *bb;
1724 :
1725 500207666 : BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1726 500207666 : BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1727 :
1728 2651480684 : while (order <= e4b->bd_blkbits + 1) {
1729 2519067202 : bb = mb_find_buddy(e4b, order, &max);
1730 2468578794 : if (!mb_test_bit(block >> order, bb)) {
1731 : /* this block is part of buddy of order 'order' */
1732 370389820 : return order;
1733 : }
1734 2151273018 : order++;
1735 : }
1736 : return 0;
1737 : }
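/*
 * A worked example of the order search above: if blocks 8..15 of a
 * group form one free order-3 chunk, then for block 10 the order-1 bit
 * (10 >> 1 == 5) and the order-2 bit (10 >> 2 == 2) are still set,
 * while the order-3 bit (10 >> 3 == 1) is clear, so the function
 * returns 3.  A lone free block with busy neighbours has no clear bit
 * at any order >= 1 and yields 0.
 */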
1738 :
1739 2644773 : static void mb_clear_bits(void *bm, int cur, int len)
1740 : {
1741 2644773 : __u32 *addr;
1742 :
1743 2644773 : len = cur + len;
1744 32159553 : while (cur < len) {
1745 29514769 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1746 : /* fast path: clear whole word at once */
1747 7001371 : addr = bm + (cur >> 3);
1748 7001371 : *addr = 0;
1749 7001371 : cur += 32;
1750 7001371 : continue;
1751 : }
1752 22513398 : mb_clear_bit(cur, bm);
1753 22513409 : cur++;
1754 : }
1755 2644784 : }
1756 :
1757 : /* Clear bits in the given range;
1758 : * return the first bit found already clear (if any), -1 otherwise.
1759 : */
1760 2838366 : static int mb_test_and_clear_bits(void *bm, int cur, int len)
1761 : {
1762 2838366 : __u32 *addr;
1763 2838366 : int zero_bit = -1;
1764 :
1765 2838366 : len = cur + len;
1766 67768396 : while (cur < len) {
1767 64930022 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1768 : /* fast path: clear whole word at once */
1769 24702319 : addr = bm + (cur >> 3);
1770 24702319 : if (*addr != (__u32)(-1) && zero_bit == -1)
1771 0 : zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1772 24702319 : *addr = 0;
1773 24702319 : cur += 32;
1774 24702319 : continue;
1775 : }
1776 40227703 : if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1777 0 : zero_bit = cur;
1778 40227711 : cur++;
1779 : }
1780 :
1781 2838374 : return zero_bit;
1782 : }
1783 :
1784 6317915 : void mb_set_bits(void *bm, int cur, int len)
1785 : {
1786 6317915 : __u32 *addr;
1787 :
1788 6317915 : len = cur + len;
1789 100922173 : while (cur < len) {
1790 94604334 : if ((cur & 31) == 0 && (len - cur) >= 32) {
1791 : /* fast path: set whole word at once */
1792 35109457 : addr = bm + (cur >> 3);
1793 35109457 : *addr = 0xffffffff;
1794 35109457 : cur += 32;
1795 35109457 : continue;
1796 : }
1797 59494877 : mb_set_bit(cur, bm);
1798 59494801 : cur++;
1799 : }
1800 6317839 : }
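/*
 * The three range helpers above share one shape: walk the range bit by
 * bit, but when the cursor is 32-bit aligned and at least a whole word
 * remains, touch the full __u32 at once.  A minimal user-space sketch
 * of that pattern (illustrative only; the in-kernel helpers also do the
 * test-and-clear bookkeeping and use the ext4 bitops):
 *
 *	void set_bits(unsigned int *bm, int cur, int len)
 *	{
 *		len = cur + len;
 *		while (cur < len) {
 *			if ((cur & 31) == 0 && (len - cur) >= 32) {
 *				bm[cur >> 5] = 0xffffffff;	// whole word
 *				cur += 32;
 *				continue;
 *			}
 *			bm[cur >> 5] |= 1U << (cur & 31);	// single bit
 *			cur++;
 *		}
 *	}
 */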
1801 :
1802 9643900 : static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1803 : {
1804 9643900 : if (mb_test_bit(*bit + side, bitmap)) {
1805 7340588 : mb_clear_bit(*bit, bitmap);
1806 7340607 : (*bit) -= side;
1807 7340607 : return 1;
1808 : }
1809 : else {
1810 2303322 : (*bit) += side;
1811 2303322 : mb_set_bit(*bit, bitmap);
1812 2303322 : return -1;
1813 : }
1814 : }
1815 :
1816 2577456 : static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1817 : {
1818 2577456 : int max;
1819 2577456 : int order = 1;
1820 2577456 : void *buddy = mb_find_buddy(e4b, order, &max);
1821 :
1822 10997852 : while (buddy) {
1823 10997841 : void *buddy2;
1824 :
1825 : /* Bits in range [first; last] are known to be set since
1826 : * corresponding blocks were allocated. Bits in range
1827 : * (first; last) will stay set because they form buddies on
1828 : * upper layer. We just deal with borders if they don't
1829 : * align with upper layer and then go up.
1830 : * Releasing entire group is all about clearing
1831 : * single bit of highest order buddy.
1832 : */
1833 :
1834 : /* Example:
1835 : * ---------------------------------
1836 : * | 1 | 1 | 1 | 1 |
1837 : * ---------------------------------
1838 : * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1839 : * ---------------------------------
1840 : * 0 1 2 3 4 5 6 7
1841 : * \_____________________/
1842 : *
1843 : * Neither [1] nor [6] is aligned to above layer.
1844 : * Left neighbour [0] is free, so mark it busy,
1845 : * decrease bb_counters and extend range to
1846 : * [0; 6]
1847 : * Right neighbour [7] is busy. It can't be coalesced with [6], so
1848 : * mark [6] free, increase bb_counters and shrink range to
1849 : * [0; 5].
1850 : * Then shift range to [0; 2], go up and do the same.
1851 : */
1852 :
1853 :
1854 10997841 : if (first & 1)
1855 5139832 : e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1856 10997827 : if (!(last & 1))
1857 4504492 : e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1858 10997800 : if (first > last)
1859 : break;
1860 8440519 : order++;
1861 :
1862 8440519 : buddy2 = mb_find_buddy(e4b, order, &max);
1863 8440549 : if (!buddy2) {
1864 20153 : mb_clear_bits(buddy, first, last - first + 1);
1865 20153 : e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1866 20153 : break;
1867 : }
1868 8420396 : first >>= 1;
1869 8420396 : last >>= 1;
1870 8420396 : buddy = buddy2;
1871 : }
1872 2577434 : }
1873 :
1874 2838341 : static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1875 : int first, int count)
1876 : {
1877 2838341 : int left_is_free = 0;
1878 2838341 : int right_is_free = 0;
1879 2838341 : int block;
1880 2838341 : int last = first + count - 1;
1881 2838341 : struct super_block *sb = e4b->bd_sb;
1882 :
1883 2838341 : if (WARN_ON(count == 0))
1884 : return;
1885 2838341 : BUG_ON(last >= (sb->s_blocksize << 3));
1886 2838341 : assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1887 : /* Don't bother if the block group is corrupt. */
1888 2838341 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1889 : return;
1890 :
1891 2838341 : mb_check_buddy(e4b);
1892 2838341 : mb_free_blocks_double(inode, e4b, first, count);
1893 :
1894 2838341 : this_cpu_inc(discard_pa_seq);
1895 2838332 : e4b->bd_info->bb_free += count;
1896 2838332 : if (first < e4b->bd_info->bb_first_free)
1897 90494 : e4b->bd_info->bb_first_free = first;
1898 :
1899 : /* access memory sequentially: check left neighbour,
1900 : * clear range and then check right neighbour
1901 : */
1902 2838332 : if (first != 0)
1903 2816726 : left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1904 2838361 : block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1905 2838382 : if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1906 2815523 : right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1907 :
1908 2838374 : if (unlikely(block != -1)) {
1909 0 : struct ext4_sb_info *sbi = EXT4_SB(sb);
1910 0 : ext4_fsblk_t blocknr;
1911 :
1912 0 : blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1913 0 : blocknr += EXT4_C2B(sbi, block);
1914 0 : if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1915 0 : ext4_grp_locked_error(sb, e4b->bd_group,
1916 : inode ? inode->i_ino : 0,
1917 : blocknr,
1918 : "freeing already freed block (bit %u); block bitmap corrupt.",
1919 : block);
1920 0 : ext4_mark_group_bitmap_corrupted(
1921 : sb, e4b->bd_group,
1922 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1923 : }
1924 0 : goto done;
1925 : }
1926 :
1927 : /* let's maintain fragments counter */
1928 2838374 : if (left_is_free && right_is_free)
1929 481031 : e4b->bd_info->bb_fragments--;
1930 2357343 : else if (!left_is_free && !right_is_free)
1931 1608482 : e4b->bd_info->bb_fragments++;
1932 :
1933 : /* buddy[0] == bd_bitmap is a special case, so handle
1934 : * it right away and let mb_buddy_mark_free stay free of
1935 : * zero order checks.
1936 : * Check if neighbours are to be coalesced,
1937 : * adjust bitmap bb_counters and borders appropriately.
1938 : */
1939 2838374 : if (first & 1) {
1940 1203968 : first += !left_is_free;
1941 2158767 : e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1942 : }
1943 2838374 : if (!(last & 1)) {
1944 1164142 : last -= !right_is_free;
1945 1909039 : e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1946 : }
1947 :
1948 2838374 : if (first <= last)
1949 2577458 : mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1950 :
1951 260916 : done:
1952 2838348 : mb_set_largest_free_order(sb, e4b->bd_info);
1953 2838376 : mb_update_avg_fragment_size(sb, e4b->bd_info);
1954 2838390 : mb_check_buddy(e4b);
1955 : }
1956 :
1957 156390378 : static int mb_find_extent(struct ext4_buddy *e4b, int block,
1958 : int needed, struct ext4_free_extent *ex)
1959 : {
1960 156390378 : int next = block;
1961 156390378 : int max, order;
1962 156390378 : void *buddy;
1963 :
1964 156390378 : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1965 156390378 : BUG_ON(ex == NULL);
1966 :
1967 156390378 : buddy = mb_find_buddy(e4b, 0, &max);
1968 156267465 : BUG_ON(buddy == NULL);
1969 156267465 : BUG_ON(block >= max);
1970 156267465 : if (mb_test_bit(block, buddy)) {
1971 964723 : ex->fe_len = 0;
1972 964723 : ex->fe_start = 0;
1973 964723 : ex->fe_group = 0;
1974 964723 : return 0;
1975 : }
1976 :
1977 : /* find actual order */
1978 155240608 : order = mb_find_order_for_block(e4b, block);
1979 154691550 : block = block >> order;
1980 :
1981 154691550 : ex->fe_len = 1 << order;
1982 154691550 : ex->fe_start = block << order;
1983 154691550 : ex->fe_group = e4b->bd_group;
1984 :
1985 : /* calc difference from given start */
1986 154691550 : next = next - ex->fe_start;
1987 154691550 : ex->fe_len -= next;
1988 154691550 : ex->fe_start += next;
1989 :
1990 976209383 : while (needed > ex->fe_len &&
1991 486244875 : mb_find_buddy(e4b, order, &max)) {
1992 :
1993 480903520 : if (block + 1 >= max)
1994 : break;
1995 :
1996 480493982 : next = (block + 1) * (1 << order);
1997 480493982 : if (mb_test_bit(next, e4b->bd_bitmap))
1998 : break;
1999 :
2000 340932720 : order = mb_find_order_for_block(e4b, next);
2001 :
2002 340614313 : block = next >> order;
2003 340614313 : ex->fe_len += 1 << order;
2004 : }
2005 :
2006 154767360 : if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
2007 : /* Should never happen! (but apparently sometimes does?!?) */
2008 0 : WARN_ON(1);
2009 0 : ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
2010 : "corruption or bug in mb_find_extent "
2011 : "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
2012 : block, order, needed, ex->fe_group, ex->fe_start,
2013 : ex->fe_len, ex->fe_logical);
2014 0 : ex->fe_len = 0;
2015 0 : ex->fe_start = 0;
2016 0 : ex->fe_group = 0;
2017 : }
2018 154767360 : return ex->fe_len;
2019 : }
2020 :
2021 2697199 : static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
2022 : {
2023 2697199 : int ord;
2024 2697199 : int mlen = 0;
2025 2697199 : int max = 0;
2026 2697199 : int cur;
2027 2697199 : int start = ex->fe_start;
2028 2697199 : int len = ex->fe_len;
2029 2697199 : unsigned ret = 0;
2030 2697199 : int len0 = len;
2031 2697199 : void *buddy;
2032 2697199 : bool split = false;
2033 :
2034 2697199 : BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
2035 2697199 : BUG_ON(e4b->bd_group != ex->fe_group);
2036 2697199 : assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
2037 2697199 : mb_check_buddy(e4b);
2038 2697199 : mb_mark_used_double(e4b, start, len);
2039 :
2040 2697199 : this_cpu_inc(discard_pa_seq);
2041 2697182 : e4b->bd_info->bb_free -= len;
2042 2697182 : if (e4b->bd_info->bb_first_free == start)
2043 560015 : e4b->bd_info->bb_first_free += len;
2044 :
2045 : /* let's maintain fragments counter */
2046 2697182 : if (start != 0)
2047 2673633 : mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
2048 2697170 : if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
2049 2672826 : max = !mb_test_bit(start + len, e4b->bd_bitmap);
2050 2697180 : if (mlen && max)
2051 176816 : e4b->bd_info->bb_fragments++;
2052 2520364 : else if (!mlen && !max)
2053 1050992 : e4b->bd_info->bb_fragments--;
2054 :
2055 : /* let's maintain buddy itself */
2056 14177665 : while (len) {
2057 11480472 : if (!split)
2058 8481605 : ord = mb_find_order_for_block(e4b, start);
2059 :
2060 11480419 : if (((start >> ord) << ord) == start && len >= (1 << ord)) {
2061 : /* the whole chunk may be allocated at once! */
2062 8481844 : mlen = 1 << ord;
2063 8481844 : if (!split)
2064 6514848 : buddy = mb_find_buddy(e4b, ord, &max);
2065 : else
2066 : split = false;
2067 8481805 : BUG_ON((start >> ord) >= max);
2068 8481805 : mb_set_bit(start >> ord, buddy);
2069 8481720 : e4b->bd_info->bb_counters[ord]--;
2070 8481720 : start += mlen;
2071 8481720 : len -= mlen;
2072 8481720 : BUG_ON(len < 0);
2073 8481720 : continue;
2074 : }
2075 :
2076 : /* store for history */
2077 2998575 : if (ret == 0)
2078 921151 : ret = len | (ord << 16);
2079 :
2080 : /* we have to split large buddy */
2081 2998575 : BUG_ON(ord <= 0);
2082 2998575 : buddy = mb_find_buddy(e4b, ord, &max);
2083 2998570 : mb_set_bit(start >> ord, buddy);
2084 2998565 : e4b->bd_info->bb_counters[ord]--;
2085 :
2086 2998565 : ord--;
2087 2998565 : cur = (start >> ord) & ~1U;
2088 2998565 : buddy = mb_find_buddy(e4b, ord, &max);
2089 2998569 : mb_clear_bit(cur, buddy);
2090 2998567 : mb_clear_bit(cur + 1, buddy);
2091 2998765 : e4b->bd_info->bb_counters[ord]++;
2092 2998765 : e4b->bd_info->bb_counters[ord]++;
2093 2998765 : split = true;
2094 : }
2095 2697193 : mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
2096 :
2097 2697146 : mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
2098 2697113 : mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
2099 2697089 : mb_check_buddy(e4b);
2100 :
2101 2697089 : return ret;
2102 : }
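/*
 * Splitting example for the loop above: taking ex = {start 10, len 3}
 * out of a free order-3 chunk covering blocks 8..15 first splits it
 * into the order-2 chunks 8..11 and 12..15, then splits 8..11 into
 * 8..9 and 10..11 and takes the order-1 buddy 10..11 whole; 12..15 is
 * then split down until the single block 12 can be taken.  Finally
 * mb_set_bits() marks bits 10..12 in the block bitmap.
 */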
2103 :
2104 : /*
2105 : * Must be called under group lock!
2106 : */
2107 2679660 : static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2108 : struct ext4_buddy *e4b)
2109 : {
2110 2679660 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2111 2679660 : int ret;
2112 :
2113 2679660 : BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
2114 2679660 : BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2115 :
2116 2679660 : ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
2117 2679660 : ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
2118 2679660 : ret = mb_mark_used(e4b, &ac->ac_b_ex);
2119 :
2120 : /* preallocation can change ac_b_ex, thus we store actually
2121 : * allocated blocks for history */
2122 2679575 : ac->ac_f_ex = ac->ac_b_ex;
2123 :
2124 2679575 : ac->ac_status = AC_STATUS_FOUND;
2125 2679575 : ac->ac_tail = ret & 0xffff;
2126 2679575 : ac->ac_buddy = ret >> 16;
2127 :
2128 : /*
2129 : * take the page reference. We want the page to be pinned
2130 : * so that we don't get an ext4_mb_init_cache() call for this
2131 : * group until we update the bitmap. That could mean we
2132 : * double allocate blocks. The reference is dropped
2133 : * in ext4_mb_release_context
2134 : */
2135 2679575 : ac->ac_bitmap_page = e4b->bd_bitmap_page;
2136 2679575 : get_page(ac->ac_bitmap_page);
2137 2679657 : ac->ac_buddy_page = e4b->bd_buddy_page;
2138 2679657 : get_page(ac->ac_buddy_page);
2139 : /* store last allocated for subsequent stream allocation */
2140 2679687 : if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2141 2036807 : spin_lock(&sbi->s_md_lock);
2142 2036824 : sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2143 2036824 : sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2144 2036824 : spin_unlock(&sbi->s_md_lock);
2145 : }
2146 : /*
2147 : * As we've just preallocated more space than
2148 : * the user originally requested, we store the allocated
2149 : * space in a special descriptor.
2150 : */
2151 2679688 : if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2152 716958 : ext4_mb_new_preallocation(ac);
2153 :
2154 2679677 : }
2155 :
2156 160740519 : static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
2157 : struct ext4_buddy *e4b,
2158 : int finish_group)
2159 : {
2160 160740519 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2161 160740519 : struct ext4_free_extent *bex = &ac->ac_b_ex;
2162 160740519 : struct ext4_free_extent *gex = &ac->ac_g_ex;
2163 :
2164 160740519 : if (ac->ac_status == AC_STATUS_FOUND)
2165 : return;
2166 : /*
2167 : * We don't want to scan for a whole year
2168 : */
2169 159632007 : if (ac->ac_found > sbi->s_mb_max_to_scan &&
2170 1321092 : !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2171 1321098 : ac->ac_status = AC_STATUS_BREAK;
2172 1321098 : return;
2173 : }
2174 :
2175 : /*
2176 : * Haven't found good chunk so far, let's continue
2177 : */
2178 158310909 : if (bex->fe_len < gex->fe_len)
2179 : return;
2180 :
2181 6944925 : if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
2182 678089 : ext4_mb_use_best_found(ac, e4b);
2183 : }
2184 :
2185 : /*
2186 : * The routine checks whether the found extent is good enough. If it is,
2187 : * then the extent gets marked used and a flag is set in the context
2188 : * to stop scanning. Otherwise, the extent is compared with the
2189 : * previously found extent and, if the new one is better, it is stored
2190 : * in the context. Later, the best found extent will be used if
2191 : * mballoc can't find a good enough extent.
2192 : *
2193 : * The algorithm used is roughly as follows:
2194 : *
2195 : * * If the free extent found is exactly as big as the goal, then
2196 : * stop the scan and use it immediately
2197 : *
2198 : * * If the free extent found is smaller than the goal, then keep retrying
2199 : * up to a max of sbi->s_mb_max_to_scan times (default 200). After
2200 : * that, stop scanning and use whatever we have.
2201 : *
2202 : * * If the free extent found is bigger than the goal, then keep retrying
2203 : * up to a max of sbi->s_mb_min_to_scan times (default 10) before
2204 : * stopping the scan and using the extent.
2205 : *
2206 : *
2207 : * FIXME: real allocation policy is to be designed yet!
2208 : */
2209 153782114 : static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
2210 : struct ext4_free_extent *ex,
2211 : struct ext4_buddy *e4b)
2212 : {
2213 153782114 : struct ext4_free_extent *bex = &ac->ac_b_ex;
2214 153782114 : struct ext4_free_extent *gex = &ac->ac_g_ex;
2215 :
2216 153782114 : BUG_ON(ex->fe_len <= 0);
2217 153782114 : BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2218 153782114 : BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2219 153782114 : BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
2220 :
2221 153782114 : ac->ac_found++;
2222 153782114 : ac->ac_cX_found[ac->ac_criteria]++;
2223 :
2224 : /*
2225 : * The special case - take what you catch first
2226 : */
2227 153824574 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2228 136227 : *bex = *ex;
2229 136227 : ext4_mb_use_best_found(ac, e4b);
2230 136227 : return;
2231 : }
2232 :
2233 : /*
2234 : * Let's check whether the chunk is good enough
2235 : */
2236 153688347 : if (ex->fe_len == gex->fe_len) {
2237 588039 : *bex = *ex;
2238 588039 : ext4_mb_use_best_found(ac, e4b);
2239 588039 : return;
2240 : }
2241 :
2242 : /*
2243 : * If this is first found extent, just store it in the context
2244 : */
2245 153100308 : if (bex->fe_len == 0) {
2246 1784501 : *bex = *ex;
2247 1784501 : return;
2248 : }
2249 :
2250 : /*
2251 : * If new found extent is better, store it in the context
2252 : */
2253 151315807 : if (bex->fe_len < gex->fe_len) {
2254 : /* if the request isn't satisfied, any found extent
2255 : * larger than previous best one is better */
2256 144699726 : if (ex->fe_len > bex->fe_len)
2257 3361865 : *bex = *ex;
2258 6616081 : } else if (ex->fe_len > gex->fe_len) {
2259 : /* if the request is satisfied, then we try to find
2260 : * an extent that still satisfies the request, but is
2261 : * smaller than the previous one */
2262 6578651 : if (ex->fe_len < bex->fe_len)
2263 670887 : *bex = *ex;
2264 : }
2265 :
2266 151315807 : ext4_mb_check_limits(ac, e4b, 0);
2267 : }
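/*
 * The selection policy above, condensed (a sketch of the decisions,
 * not a drop-in replacement for the code):
 *
 *	if (ac_flags & EXT4_MB_HINT_FIRST)	use ex right away;
 *	else if (ex.len == goal.len)		use ex right away;
 *	else if (best.len == 0)			best = ex;
 *	else if (best.len < goal.len)		keep the longer of best/ex;
 *	else if (ex.len > goal.len)		keep the shorter of best/ex;
 *
 * ext4_mb_check_limits() then stops the scan after s_mb_max_to_scan
 * extents, or commits "best" once it satisfies the goal and more than
 * s_mb_min_to_scan extents have been examined (or the group scan
 * finishes).
 */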
2268 :
2269 : static noinline_for_stack
2270 811570 : void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
2271 : struct ext4_buddy *e4b)
2272 : {
2273 811570 : struct ext4_free_extent ex = ac->ac_b_ex;
2274 811570 : ext4_group_t group = ex.fe_group;
2275 811570 : int max;
2276 811570 : int err;
2277 :
2278 811570 : BUG_ON(ex.fe_len <= 0);
2279 811570 : err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2280 811572 : if (err)
2281 0 : return;
2282 :
2283 811572 : ext4_lock_group(ac->ac_sb, group);
2284 811598 : max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
2285 :
2286 811566 : if (max > 0) {
2287 675340 : ac->ac_b_ex = ex;
2288 675340 : ext4_mb_use_best_found(ac, e4b);
2289 : }
2290 :
2291 811556 : ext4_unlock_group(ac->ac_sb, group);
2292 811599 : ext4_mb_unload_buddy(e4b);
2293 : }
2294 :
2295 : static noinline_for_stack
2296 2685004 : int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
2297 : struct ext4_buddy *e4b)
2298 : {
2299 2685004 : ext4_group_t group = ac->ac_g_ex.fe_group;
2300 2685004 : int max;
2301 2685004 : int err;
2302 2685004 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2303 2685004 : struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2304 2685015 : struct ext4_free_extent ex;
2305 :
2306 2685015 : if (!grp)
2307 : return -EFSCORRUPTED;
2308 2685015 : if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
2309 : return 0;
2310 1330075 : if (grp->bb_free == 0)
2311 : return 0;
2312 :
2313 1229232 : err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2314 1229281 : if (err)
2315 : return err;
2316 :
2317 1229281 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
2318 0 : ext4_mb_unload_buddy(e4b);
2319 0 : return 0;
2320 : }
2321 :
2322 1229281 : ext4_lock_group(ac->ac_sb, group);
2323 1229354 : max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
2324 : ac->ac_g_ex.fe_len, &ex);
2325 1229358 : ex.fe_logical = 0xDEADFA11; /* debug value */
2326 :
2327 1229358 : if (max >= ac->ac_g_ex.fe_len &&
2328 238735 : ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
2329 229 : ext4_fsblk_t start;
2330 :
2331 229 : start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
2332 : /* use do_div to get remainder (would be 64-bit modulo) */
2333 229 : if (do_div(start, sbi->s_stripe) == 0) {
2334 96 : ac->ac_found++;
2335 96 : ac->ac_b_ex = ex;
2336 96 : ext4_mb_use_best_found(ac, e4b);
2337 : }
2338 1229129 : } else if (max >= ac->ac_g_ex.fe_len) {
2339 238506 : BUG_ON(ex.fe_len <= 0);
2340 238506 : BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2341 238506 : BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2342 238506 : ac->ac_found++;
2343 238506 : ac->ac_b_ex = ex;
2344 238506 : ext4_mb_use_best_found(ac, e4b);
2345 990623 : } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
2346 : /* Sometimes, caller may want to merge even small
2347 : * number of blocks to an existing extent */
2348 0 : BUG_ON(ex.fe_len <= 0);
2349 0 : BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2350 0 : BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2351 0 : ac->ac_found++;
2352 0 : ac->ac_b_ex = ex;
2353 0 : ext4_mb_use_best_found(ac, e4b);
2354 : }
2355 1229358 : ext4_unlock_group(ac->ac_sb, group);
2356 1229420 : ext4_mb_unload_buddy(e4b);
2357 :
2358 1229420 : return 0;
2359 : }
2360 :
2361 : /*
2362 : * The routine scans buddy structures (not the bitmap!) from the given order
2363 : * to the max order and tries to find a big enough chunk to satisfy the request
2364 : */
2365 : static noinline_for_stack
2366 350723 : void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
2367 : struct ext4_buddy *e4b)
2368 : {
2369 350723 : struct super_block *sb = ac->ac_sb;
2370 350723 : struct ext4_group_info *grp = e4b->bd_info;
2371 350723 : void *buddy;
2372 350723 : int i;
2373 350723 : int k;
2374 350723 : int max;
2375 :
2376 350723 : BUG_ON(ac->ac_2order <= 0);
2377 524913 : for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
2378 524913 : if (grp->bb_counters[i] == 0)
2379 174190 : continue;
2380 :
2381 350723 : buddy = mb_find_buddy(e4b, i, &max);
2382 350723 : if (WARN_RATELIMIT(buddy == NULL,
2383 : "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
2384 0 : continue;
2385 :
2386 350723 : k = mb_find_next_zero_bit(buddy, max, 0);
2387 350723 : if (k >= max) {
2388 0 : ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
2389 : "%d free clusters of order %d. But found 0",
2390 : grp->bb_counters[i], i);
2391 0 : ext4_mark_group_bitmap_corrupted(ac->ac_sb,
2392 : e4b->bd_group,
2393 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2394 0 : break;
2395 : }
2396 350723 : ac->ac_found++;
2397 350723 : ac->ac_cX_found[ac->ac_criteria]++;
2398 :
2399 350723 : ac->ac_b_ex.fe_len = 1 << i;
2400 350723 : ac->ac_b_ex.fe_start = k << i;
2401 350723 : ac->ac_b_ex.fe_group = e4b->bd_group;
2402 :
2403 350723 : ext4_mb_use_best_found(ac, e4b);
2404 :
2405 350722 : BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
2406 :
2407 350722 : if (EXT4_SB(sb)->s_mb_stats)
2408 0 : atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2409 :
2410 : break;
2411 : }
2412 350722 : }
2413 :
2414 : /*
2415 : * The routine scans the group and measures all found extents.
2416 : * To optimize scanning, the routine uses the group's free block
2417 : * count (from the group info) as an upper limit on the scan.
2418 : */
2419 : static noinline_for_stack
2420 9416469 : void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
2421 : struct ext4_buddy *e4b)
2422 : {
2423 9416469 : struct super_block *sb = ac->ac_sb;
2424 9416469 : void *bitmap = e4b->bd_bitmap;
2425 9416469 : struct ext4_free_extent ex;
2426 9416469 : int i, j, freelen;
2427 9416469 : int free;
2428 :
2429 9416469 : free = e4b->bd_info->bb_free;
2430 9416469 : if (WARN_ON(free <= 0))
2431 0 : return;
2432 :
2433 9416469 : i = e4b->bd_info->bb_first_free;
2434 :
2435 177364502 : while (free && ac->ac_status == AC_STATUS_CONTINUE) {
2436 167950114 : i = mb_find_next_zero_bit(bitmap,
2437 167950114 : EXT4_CLUSTERS_PER_GROUP(sb), i);
2438 168071339 : if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
2439 : /*
2440 : * If the bitmap is corrupt, we won't find any
2441 : * free blocks even though the group info says
2442 : * there are free blocks
2443 : */
2444 0 : ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2445 : "%d free clusters as per "
2446 : "group info. But bitmap says 0",
2447 : free);
2448 0 : ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2449 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2450 0 : break;
2451 : }
2452 :
2453 168071339 : if (ac->ac_criteria < CR_FAST) {
2454 : /*
2455 : * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
2456 : * sure that this group will have a large enough
2457 : * continuous free extent, so skip over the smaller free
2458 : * extents
2459 : */
2460 21951288 : j = mb_find_next_bit(bitmap,
2461 : EXT4_CLUSTERS_PER_GROUP(sb), i);
2462 21950674 : freelen = j - i;
2463 :
2464 21950674 : if (freelen < ac->ac_g_ex.fe_len) {
2465 14079302 : i = j;
2466 14079302 : free -= freelen;
2467 14079302 : continue;
2468 : }
2469 : }
2470 :
2471 153991423 : mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
2472 153720019 : if (WARN_ON(ex.fe_len <= 0))
2473 : break;
2474 153720019 : if (free < ex.fe_len) {
2475 0 : ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2476 : "%d free clusters as per "
2477 : "group info. But got %d blocks",
2478 : free, ex.fe_len);
2479 0 : ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2480 : EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2481 : /*
2482 : * The number of free blocks differs. This mostly
2483 : * indicates that the bitmap is corrupt. So exit
2484 : * without claiming the space.
2485 : */
2486 0 : break;
2487 : }
2488 153720019 : ex.fe_logical = 0xDEADC0DE; /* debug value */
2489 153720019 : ext4_mb_measure_extent(ac, &ex, e4b);
2490 :
2491 153868731 : i += ex.fe_len;
2492 153868731 : free -= ex.fe_len;
2493 : }
2494 :
2495 9414388 : ext4_mb_check_limits(ac, e4b, 1);
2496 : }
2497 :
2498 : /*
2499 : * This is a special case for storages like raid5
2500 : * we try to find stripe-aligned chunks for stripe-size-multiple requests
2501 : */
2502 : static noinline_for_stack
2503 151172 : void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2504 : struct ext4_buddy *e4b)
2505 : {
2506 151172 : struct super_block *sb = ac->ac_sb;
2507 151172 : struct ext4_sb_info *sbi = EXT4_SB(sb);
2508 151172 : void *bitmap = e4b->bd_bitmap;
2509 151172 : struct ext4_free_extent ex;
2510 151172 : ext4_fsblk_t first_group_block;
2511 151172 : ext4_fsblk_t a;
2512 151172 : ext4_grpblk_t i, stripe;
2513 151172 : int max;
2514 :
2515 151172 : BUG_ON(sbi->s_stripe == 0);
2516 :
2517 : /* find first stripe-aligned block in group */
2518 151172 : first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2519 :
2520 151172 : a = first_group_block + sbi->s_stripe - 1;
2521 151172 : do_div(a, sbi->s_stripe);
2522 151172 : i = (a * sbi->s_stripe) - first_group_block;
2523 :
2524 151172 : stripe = EXT4_B2C(sbi, sbi->s_stripe);
2525 151172 : i = EXT4_B2C(sbi, i);
2526 18246095 : while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2527 18107600 : if (!mb_test_bit(i, bitmap)) {
2528 261121 : max = mb_find_extent(e4b, i, stripe, &ex);
2529 261121 : if (max >= stripe) {
2530 12677 : ac->ac_found++;
2531 12677 : ac->ac_cX_found[ac->ac_criteria]++;
2532 12677 : ex.fe_logical = 0xDEADF00D; /* debug value */
2533 12677 : ac->ac_b_ex = ex;
2534 12677 : ext4_mb_use_best_found(ac, e4b);
2535 12677 : break;
2536 : }
2537 : }
2538 18094923 : i += stripe;
2539 : }
2540 151172 : }
2541 :
2542 : /*
2543 : * This is also called BEFORE we load the buddy bitmap.
2544 : * Returns true if the group is suitable for the allocation,
2545 : * false otherwise.
2546 : */
2547 35083170 : static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
2548 : ext4_group_t group, enum criteria cr)
2549 : {
2550 35083170 : ext4_grpblk_t free, fragments;
2551 35083170 : int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2552 35078447 : struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2553 :
2554 35115534 : BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
2555 :
2556 35115534 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
2557 : return false;
2558 :
2559 35115534 : free = grp->bb_free;
2560 35115534 : if (free == 0)
2561 : return false;
2562 :
2563 25516619 : fragments = grp->bb_fragments;
2564 25516619 : if (fragments == 0)
2565 : return false;
2566 :
2567 25516580 : switch (cr) {
2568 1168541 : case CR_POWER2_ALIGNED:
2569 1168541 : BUG_ON(ac->ac_2order == 0);
2570 :
2571 : /* Avoid using the first bg of a flexgroup for data files */
2572 1168541 : if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2573 1167815 : (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2574 1167815 : ((group % flex_size) == 0))
2575 : return false;
2576 :
2577 996586 : if (free < ac->ac_g_ex.fe_len)
2578 : return false;
2579 :
2580 996581 : if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
2581 : return true;
2582 :
2583 996581 : if (grp->bb_largest_free_order < ac->ac_2order)
2584 293317 : return false;
2585 :
2586 : return true;
2587 7948914 : case CR_GOAL_LEN_FAST:
2588 : case CR_BEST_AVAIL_LEN:
2589 7948914 : if ((free / fragments) >= ac->ac_g_ex.fe_len)
2590 2852884 : return true;
2591 : break;
2592 3641626 : case CR_GOAL_LEN_SLOW:
2593 3641626 : if (free >= ac->ac_g_ex.fe_len)
2594 3639796 : return true;
2595 : break;
2596 : case CR_ANY_FREE:
2597 : return true;
2598 0 : default:
2599 0 : BUG();
2600 : }
2601 :
2602 : return false;
2603 : }
2604 :
2605 : /*
2606 : * This could return a negative error code if something goes wrong
2607 : * during ext4_mb_init_group(). This should not be called with
2608 : * ext4_lock_group() held.
2609 : *
2610 : * Note: because we are conditionally operating with the group lock in
2611 : * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
2612 : * function using __acquire and __release. This means we need to be
2613 : * super careful before messing with the error path handling via "goto
2614 : * out"!
2615 : */
2616 168821892 : static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2617 : ext4_group_t group, enum criteria cr)
2618 : {
2619 168821892 : struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2620 169528355 : struct super_block *sb = ac->ac_sb;
2621 169528355 : struct ext4_sb_info *sbi = EXT4_SB(sb);
2622 169528355 : bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2623 169528355 : ext4_grpblk_t free;
2624 169528355 : int ret = 0;
2625 :
2626 169528355 : if (!grp)
2627 : return -EFSCORRUPTED;
2628 169528355 : if (sbi->s_mb_stats)
2629 0 : atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2630 169528355 : if (should_lock) {
2631 67623 : ext4_lock_group(sb, group);
2632 169528829 : __release(ext4_group_lock_ptr(sb, group));
2633 : }
2634 169528829 : free = grp->bb_free;
2635 169528829 : if (free == 0)
2636 112750066 : goto out;
2637 56778763 : if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
2638 42637842 : goto out;
2639 14140921 : if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2640 268 : goto out;
2641 14140653 : if (should_lock) {
2642 83 : __acquire(ext4_group_lock_ptr(sb, group));
2643 83 : ext4_unlock_group(sb, group);
2644 : }
2645 :
2646 : /* We only do this if the grp has never been initialized */
2647 14140653 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2648 4601 : struct ext4_group_desc *gdp =
2649 4601 : ext4_get_group_desc(sb, group, NULL);
2650 4601 : int ret;
2651 :
2652 : /*
2653 : * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
2654 : * search to find large good chunks almost for free. If buddy
2655 : * data is not ready, then this optimization makes no sense. But
2656 : * we never skip the first block group in a flex_bg, since this
2657 : * gets used for metadata block allocation, and we want to make
2658 : * sure we locate metadata blocks in the first block group in
2659 : * the flex_bg if possible.
2660 : */
2661 4601 : if (cr < CR_FAST &&
2662 4267 : (!sbi->s_log_groups_per_flex ||
2663 5697 : ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2664 2687 : !(ext4_has_group_desc_csum(sb) &&
2665 1443 : (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2666 : return 0;
2667 2424 : ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2668 2424 : if (ret)
2669 : return ret;
2670 : }
2671 :
2672 14138211 : if (should_lock) {
2673 83 : ext4_lock_group(sb, group);
2674 14138211 : __release(ext4_group_lock_ptr(sb, group));
2675 : }
2676 14138211 : ret = ext4_mb_good_group(ac, group, cr);
2677 169522639 : out:
2678 169522639 : if (should_lock) {
2679 67731 : __acquire(ext4_group_lock_ptr(sb, group));
2680 67731 : ext4_unlock_group(sb, group);
2681 : }
2682 : return ret;
2683 : }
2684 :
2685 : /*
2686 : * Start prefetching @nr block bitmaps starting at @group.
2687 : * Return the next group which needs to be prefetched.
2688 : */
2689 15448382 : ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2690 : unsigned int nr, int *cnt)
2691 : {
2692 15448382 : ext4_group_t ngroups = ext4_get_groups_count(sb);
2693 15449331 : struct buffer_head *bh;
2694 15449331 : struct blk_plug plug;
2695 :
2696 15449331 : blk_start_plug(&plug);
2697 225185399 : while (nr-- > 0) {
2698 209738107 : struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2699 : NULL);
2700 209067975 : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2701 :
2702 : /*
2703 : * Prefetch block groups with free blocks; but don't
2704 : * bother if it is marked uninitialized on disk, since
2705 : * it won't require I/O to read. Also only try to
2706 : * prefetch once, so we avoid getblk() call, which can
2707 : * be expensive.
2708 : */
2709 208414200 : if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2710 314723 : EXT4_MB_GRP_NEED_INIT(grp) &&
2711 156788 : ext4_free_group_clusters(sb, gdp) > 0 ) {
2712 154939 : bh = ext4_read_block_bitmap_nowait(sb, group, true);
2713 154938 : if (bh && !IS_ERR(bh)) {
2714 309845 : if (!buffer_uptodate(bh) && cnt)
2715 69143 : (*cnt)++;
2716 154923 : brelse(bh);
2717 : }
2718 : }
2719 209736068 : if (++group >= ngroups)
2720 1339136 : group = 0;
2721 : }
2722 15448904 : blk_finish_plug(&plug);
2723 15449766 : return group;
2724 : }
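/*
 * Usage note: the regular allocator below caps nr at the remainder of
 * the current flex_bg, e.g. with s_log_groups_per_flex == 4 and
 * group == 37 it asks for nr = 16 - (37 & 15) == 11 groups (further
 * clamped to sbi->s_mb_prefetch), so prefetching stops at a flex_bg
 * boundary.
 */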
2725 :
2726 : /*
2727 : * Prefetching reads the block bitmap into the buffer cache; but we
2728 : * need to make sure that the buddy bitmap in the page cache has been
2729 : * initialized. Note that ext4_mb_init_group() will block if the I/O
2730 : * is not yet completed, or will issue and wait for the read itself
2731 : * if ext4_mb_prefetch did not start the I/O.
2732 : *
2733 : * TODO: We should actually kick off the buddy bitmap setup in a work
2734 : * queue when the buffer I/O is completed, so that we don't block
2735 : * waiting for the block allocation bitmap read to finish when
2736 : * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2737 : */
2738 2448467 : void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2739 : unsigned int nr)
2740 : {
2741 2448467 : struct ext4_group_desc *gdp;
2742 2448467 : struct ext4_group_info *grp;
2743 :
2744 32195759 : while (nr-- > 0) {
2745 29747152 : if (!group)
2746 252580 : group = ext4_get_groups_count(sb);
2747 29747152 : group--;
2748 29747152 : gdp = ext4_get_group_desc(sb, group, NULL);
2749 29747831 : grp = ext4_get_group_info(sb, group);
2750 :
2751 59772814 : if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
2752 278216 : ext4_free_group_clusters(sb, gdp) > 0) {
2753 149383 : if (ext4_mb_init_group(sb, group, GFP_NOFS))
2754 : break;
2755 : }
2756 : }
2757 2448608 : }
2758 :
2759 : static noinline_for_stack int
2760 2685015 : ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2761 : {
2762 2685015 : ext4_group_t prefetch_grp = 0, ngroups, group, i;
2763 2685015 : enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
2764 2685015 : int err = 0, first_err = 0;
2765 2685015 : unsigned int nr = 0, prefetch_ios = 0;
2766 2685015 : struct ext4_sb_info *sbi;
2767 2685015 : struct super_block *sb;
2768 2685015 : struct ext4_buddy e4b;
2769 2685015 : int lost;
2770 :
2771 2685015 : sb = ac->ac_sb;
2772 2685015 : sbi = EXT4_SB(sb);
2773 2685015 : ngroups = ext4_get_groups_count(sb);
2774 : /* non-extent files are limited to low blocks/groups */
2775 2684752 : if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2776 690 : ngroups = sbi->s_blockfile_groups;
2777 :
2778 2684752 : BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2779 :
2780 : /* first, try the goal */
2781 2684752 : err = ext4_mb_find_by_goal(ac, &e4b);
2782 2685007 : if (err || ac->ac_status == AC_STATUS_FOUND)
2783 238601 : goto out;
2784 :
2785 2446406 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2786 0 : goto out;
2787 :
2788 : /*
2789 : * ac->ac_2order is set only if the fe_len is a power of 2
2790 : * if ac->ac_2order is set we also set criteria to 0 so that we
2791 : * try exact allocation using buddy.
2792 : */
2793 2446406 : i = fls(ac->ac_g_ex.fe_len);
2794 2446406 : ac->ac_2order = 0;
2795 : /*
2796 : * We search using buddy data only if the order of the request
2797 : * is greater than or equal to sbi->s_mb_order2_reqs.
2798 : * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2799 : * We also support searching for power-of-two requests only for
2800 : * requests up to the maximum buddy size we have constructed.
2801 : */
2802 2446406 : if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
2803 : /*
2804 : * This tells us whether fe_len is exactly a power of 2
2805 : */
2806 1846750 : if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2807 493359 : ac->ac_2order = array_index_nospec(i - 1,
2808 : MB_NUM_ORDERS(sb));
2809 : }
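	/*
	 * For example, a goal of fe_len == 8 gives i = fls(8) == 4 and
	 * 8 & ~(1 << 3) == 0, so ac_2order becomes 3 and the
	 * CR_POWER2_ALIGNED pass can satisfy the request straight from an
	 * order-3 buddy.  A goal of 12 (fls == 4, 12 & ~(1 << 3) == 4)
	 * leaves ac_2order at 0.
	 */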
2810 :
2811 : /* if stream allocation is enabled, use global goal */
2812 2446399 : if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2813 : /* TBD: may be hot point */
2814 1803638 : spin_lock(&sbi->s_md_lock);
2815 1804023 : ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2816 1804023 : ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2817 1804023 : spin_unlock(&sbi->s_md_lock);
2818 : }
2819 :
2820 : /*
2821 : * Let's just scan groups to find more-or-less suitable blocks. We
2822 : * start with CR_GOAL_LEN_FAST, unless the request is power-of-2
2823 : * aligned, in which case let's do that faster approach first.
2824 : */
2825 2446758 : if (ac->ac_2order)
2826 493448 : cr = CR_POWER2_ALIGNED;
2827 1953310 : repeat:
2828 10751362 : for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2829 4908277 : ac->ac_criteria = cr;
2830 : /*
2831 : * searching for the right group start
2832 : * from the goal value specified
2833 : */
2834 4908277 : group = ac->ac_g_ex.fe_group;
2835 4908277 : ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
2836 4908277 : prefetch_grp = group;
2837 :
2838 170906260 : for (i = 0, new_cr = cr; i < ngroups; i++,
2839 166708041 : ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
2840 170073613 : int ret = 0;
2841 :
2842 170073613 : cond_resched();
2843 170351957 : if (new_cr != cr) {
2844 1650514 : cr = new_cr;
2845 1650514 : goto repeat;
2846 : }
2847 :
2848 : /*
2849 : * Batch reads of the block allocation bitmaps
2850 : * to get multiple READs in flight; limit
2851 : * prefetching at the optimistic criteria (below CR_FAST),
2852 : * otherwise mballoc can spend a lot of time loading imperfect groups
2853 : */
2854 168701443 : if ((prefetch_grp == group) &&
2855 3881213 : (cr >= CR_FAST ||
2856 3881213 : prefetch_ios < sbi->s_mb_prefetch_limit)) {
2857 15439392 : nr = sbi->s_mb_prefetch;
2858 15439392 : if (ext4_has_feature_flex_bg(sb)) {
2859 15438520 : nr = 1 << sbi->s_log_groups_per_flex;
2860 15438520 : nr -= group & (nr - 1);
2861 15438520 : nr = min(nr, sbi->s_mb_prefetch);
2862 : }
2863 15439392 : prefetch_grp = ext4_mb_prefetch(sb, group,
2864 : nr, &prefetch_ios);
2865 : }
2866 :
2867 : /* This now checks without needing the buddy page */
2868 168708063 : ret = ext4_mb_good_group_nolock(ac, group, cr);
2869 169126258 : if (ret <= 0) {
2870 159211758 : if (!first_err)
2871 159231511 : first_err = ret;
2872 159211758 : continue;
2873 : }
2874 :
2875 9914500 : err = ext4_mb_load_buddy(sb, group, &e4b);
2876 9918663 : if (err)
2877 0 : goto out;
2878 :
2879 9918663 : ext4_lock_group(sb, group);
2880 :
2881 : /*
2882 : * We need to check again after locking the
2883 : * block group
2884 : */
2885 9922468 : ret = ext4_mb_good_group(ac, group, cr);
2886 9917586 : if (ret == 0) {
2887 866 : ext4_unlock_group(sb, group);
2888 866 : ext4_mb_unload_buddy(&e4b);
2889 866 : continue;
2890 : }
2891 :
2892 9916720 : ac->ac_groups_scanned++;
2893 9916720 : if (cr == CR_POWER2_ALIGNED)
2894 350723 : ext4_mb_simple_scan_group(ac, &e4b);
2895 9565997 : else if ((cr == CR_GOAL_LEN_FAST ||
2896 1348819 : cr == CR_BEST_AVAIL_LEN) &&
2897 1348819 : sbi->s_stripe &&
2898 1232181 : !(ac->ac_g_ex.fe_len %
2899 1232181 : EXT4_B2C(sbi, sbi->s_stripe)))
2900 151172 : ext4_mb_scan_aligned(ac, &e4b);
2901 : else
2902 9414825 : ext4_mb_complex_scan_group(ac, &e4b);
2903 :
2904 9915687 : ext4_unlock_group(sb, group);
2905 9924997 : ext4_mb_unload_buddy(&e4b);
2906 :
2907 9921687 : if (ac->ac_status != AC_STATUS_CONTINUE)
2908 : break;
2909 : }
2910 : /* Processed all groups and haven't found blocks */
2911 3258917 : if (sbi->s_mb_stats && i == ngroups)
2912 0 : atomic64_inc(&sbi->s_bal_cX_failed[cr]);
2913 :
2914 3258917 : if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
2915 : /* Reset goal length to original goal length before
2916 : * falling into CR_GOAL_LEN_SLOW */
2917 117701 : ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
2918 : }
2919 :
2920 2584168 : if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2921 812615 : !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2922 : /*
2923 : * We've been searching too long. Let's try to allocate
2924 : * the best chunk we've found so far
2925 : */
2926 812615 : ext4_mb_try_best_found(ac, &e4b);
2927 811581 : if (ac->ac_status != AC_STATUS_FOUND) {
2928 : /*
2929 : * Someone luckier has already allocated it.
2930 : * The only thing we can do is just take the first
2931 : * found block(s)
2932 : */
2933 136249 : lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2934 136256 : mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2935 : ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2936 : ac->ac_b_ex.fe_len, lost);
2937 :
2938 136256 : ac->ac_b_ex.fe_group = 0;
2939 136256 : ac->ac_b_ex.fe_start = 0;
2940 136256 : ac->ac_b_ex.fe_len = 0;
2941 136256 : ac->ac_status = AC_STATUS_CONTINUE;
2942 136256 : ac->ac_flags |= EXT4_MB_HINT_FIRST;
2943 136256 : cr = CR_ANY_FREE;
2944 136256 : goto repeat;
2945 : }
2946 : }
2947 :
2948 2446886 : if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2949 0 : atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
2950 2446886 : out:
2951 2685487 : if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2952 1 : err = first_err;
2953 :
2954 2685487 : mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2955 : ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2956 : ac->ac_flags, cr, err);
2957 :
2958 2685487 : if (nr)
2959 2446876 : ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2960 :
2961 2685612 : return err;
2962 : }
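/*
 * Putting the pieces together, the allocator above proceeds roughly as
 * follows (a sketch of the control flow, not the exact code):
 *
 *	try the goal extent (ext4_mb_find_by_goal());
 *	cr = ac_2order ? CR_POWER2_ALIGNED : CR_GOAL_LEN_FAST;
 *	for (; cr < EXT4_MB_NUM_CRS && status == AC_STATUS_CONTINUE; cr++)
 *		for each candidate group (ext4_mb_choose_next_group()):
 *			prefetch bitmaps, ext4_mb_good_group_nolock();
 *			load the buddy, lock the group, re-check, then run the
 *			simple / aligned / complex scanner for this criteria;
 *	if nothing was committed but a best extent was remembered,
 *		ext4_mb_try_best_found(); if that races and loses, retry
 *		from CR_ANY_FREE with EXT4_MB_HINT_FIRST set.
 */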
2963 :
2964 47 : static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2965 : {
2966 47 : struct super_block *sb = pde_data(file_inode(seq->file));
2967 47 : ext4_group_t group;
2968 :
2969 94 : if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2970 0 : return NULL;
2971 47 : group = *pos + 1;
2972 47 : return (void *) ((unsigned long) group);
2973 : }
2974 :
2975 1615 : static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2976 : {
2977 1615 : struct super_block *sb = pde_data(file_inode(seq->file));
2978 1615 : ext4_group_t group;
2979 :
2980 1615 : ++*pos;
2981 3230 : if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2982 0 : return NULL;
2983 1615 : group = *pos + 1;
2984 1615 : return (void *) ((unsigned long) group);
2985 : }
2986 :
2987 1662 : static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2988 : {
2989 1662 : struct super_block *sb = pde_data(file_inode(seq->file));
2990 1662 : ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2991 1662 : int i;
2992 1662 : int err, buddy_loaded = 0;
2993 1662 : struct ext4_buddy e4b;
2994 1662 : struct ext4_group_info *grinfo;
2995 1662 : unsigned char blocksize_bits = min_t(unsigned char,
2996 : sb->s_blocksize_bits,
2997 : EXT4_MAX_BLOCK_LOG_SIZE);
2998 1662 : struct sg {
2999 : struct ext4_group_info info;
3000 : ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
3001 : } sg;
3002 :
3003 1662 : group--;
3004 1662 : if (group == 0)
3005 30 : seq_puts(seq, "#group: free frags first ["
3006 : " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
3007 : " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
3008 :
3009 1662 : i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
3010 : sizeof(struct ext4_group_info);
3011 :
3012 1662 : grinfo = ext4_get_group_info(sb, group);
3013 1662 : if (!grinfo)
3014 : return 0;
3015 : /* Load the group info in memory only if not already loaded. */
3016 1662 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
3017 1645 : err = ext4_mb_load_buddy(sb, group, &e4b);
3018 1645 : if (err) {
3019 0 : seq_printf(seq, "#%-5u: I/O error\n", group);
3020 0 : return 0;
3021 : }
3022 : buddy_loaded = 1;
3023 : }
3024 :
3025 1662 : memcpy(&sg, grinfo, i);
3026 :
3027 1662 : if (buddy_loaded)
3028 1645 : ext4_mb_unload_buddy(&e4b);
3029 :
3030 1662 : seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
3031 : sg.info.bb_fragments, sg.info.bb_first_free);
3032 26592 : for (i = 0; i <= 13; i++)
3033 23268 : seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
3034 : sg.info.bb_counters[i] : 0);
3035 1662 : seq_puts(seq, " ]\n");
3036 :
3037 1662 : return 0;
3038 : }
3039 :
3040 47 : static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
3041 : {
3042 47 : }
3043 :
3044 : const struct seq_operations ext4_mb_seq_groups_ops = {
3045 : .start = ext4_mb_seq_groups_start,
3046 : .next = ext4_mb_seq_groups_next,
3047 : .stop = ext4_mb_seq_groups_stop,
3048 : .show = ext4_mb_seq_groups_show,
3049 : };
3050 :
3051 0 : int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
3052 : {
3053 0 : struct super_block *sb = seq->private;
3054 0 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3055 :
3056 0 : seq_puts(seq, "mballoc:\n");
3057 0 : if (!sbi->s_mb_stats) {
3058 0 : seq_puts(seq, "\tmb stats collection turned off.\n");
3059 0 : seq_puts(
3060 : seq,
3061 : "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
3062 0 : return 0;
3063 : }
3064 0 : seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
3065 0 : seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
3066 :
3067 0 : seq_printf(seq, "\tgroups_scanned: %u\n",
3068 : atomic_read(&sbi->s_bal_groups_scanned));
3069 :
3070 : /* CR_POWER2_ALIGNED stats */
3071 0 : seq_puts(seq, "\tcr_p2_aligned_stats:\n");
3072 0 : seq_printf(seq, "\t\thits: %llu\n",
3073 : atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
3074 0 : seq_printf(
3075 : seq, "\t\tgroups_considered: %llu\n",
3076 : atomic64_read(
3077 : &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
3078 0 : seq_printf(seq, "\t\textents_scanned: %u\n",
3079 : atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
3080 0 : seq_printf(seq, "\t\tuseless_loops: %llu\n",
3081 : atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
3082 0 : seq_printf(seq, "\t\tbad_suggestions: %u\n",
3083 : atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
3084 :
3085 : /* CR_GOAL_LEN_FAST stats */
3086 0 : seq_puts(seq, "\tcr_goal_fast_stats:\n");
3087 0 : seq_printf(seq, "\t\thits: %llu\n",
3088 : atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
3089 0 : seq_printf(seq, "\t\tgroups_considered: %llu\n",
3090 : atomic64_read(
3091 : &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
3092 0 : seq_printf(seq, "\t\textents_scanned: %u\n",
3093 : atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
3094 0 : seq_printf(seq, "\t\tuseless_loops: %llu\n",
3095 : atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
3096 0 : seq_printf(seq, "\t\tbad_suggestions: %u\n",
3097 : atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
3098 :
3099 : /* CR_BEST_AVAIL_LEN stats */
3100 0 : seq_puts(seq, "\tcr_best_avail_stats:\n");
3101 0 : seq_printf(seq, "\t\thits: %llu\n",
3102 : atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
3103 0 : seq_printf(
3104 : seq, "\t\tgroups_considered: %llu\n",
3105 : atomic64_read(
3106 : &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
3107 0 : seq_printf(seq, "\t\textents_scanned: %u\n",
3108 : atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
3109 0 : seq_printf(seq, "\t\tuseless_loops: %llu\n",
3110 : atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
3111 0 : seq_printf(seq, "\t\tbad_suggestions: %u\n",
3112 : atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
3113 :
3114 : /* CR_GOAL_LEN_SLOW stats */
3115 0 : seq_puts(seq, "\tcr_goal_slow_stats:\n");
3116 0 : seq_printf(seq, "\t\thits: %llu\n",
3117 : atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
3118 0 : seq_printf(seq, "\t\tgroups_considered: %llu\n",
3119 : atomic64_read(
3120 : &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
3121 0 : seq_printf(seq, "\t\textents_scanned: %u\n",
3122 : atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
3123 0 : seq_printf(seq, "\t\tuseless_loops: %llu\n",
3124 : atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
3125 :
3126 : /* CR_ANY_FREE stats */
3127 0 : seq_puts(seq, "\tcr_any_free_stats:\n");
3128 0 : seq_printf(seq, "\t\thits: %llu\n",
3129 : atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
3130 0 : seq_printf(
3131 : seq, "\t\tgroups_considered: %llu\n",
3132 : atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
3133 0 : seq_printf(seq, "\t\textents_scanned: %u\n",
3134 : atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
3135 0 : seq_printf(seq, "\t\tuseless_loops: %llu\n",
3136 : atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
3137 :
3138 : /* Aggregates */
3139 0 : seq_printf(seq, "\textents_scanned: %u\n",
3140 : atomic_read(&sbi->s_bal_ex_scanned));
3141 0 : seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
3142 0 : seq_printf(seq, "\t\tlen_goal_hits: %u\n",
3143 : atomic_read(&sbi->s_bal_len_goals));
3144 0 : seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
3145 0 : seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
3146 0 : seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
3147 0 : seq_printf(seq, "\tbuddies_generated: %u/%u\n",
3148 : atomic_read(&sbi->s_mb_buddies_generated),
3149 : ext4_get_groups_count(sb));
3150 0 : seq_printf(seq, "\tbuddies_time_used: %llu\n",
3151 : atomic64_read(&sbi->s_mb_generation_time));
3152 0 : seq_printf(seq, "\tpreallocated: %u\n",
3153 : atomic_read(&sbi->s_mb_preallocated));
3154 0 : seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
3155 0 : return 0;
3156 : }
3157 :
3158 0 : static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
3159 : __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
3160 : {
3161 0 : struct super_block *sb = pde_data(file_inode(seq->file));
3162 0 : unsigned long position;
3163 :
3164 0 : if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
3165 : return NULL;
3166 0 : position = *pos + 1;
3167 0 : return (void *) ((unsigned long) position);
3168 : }
3169 :
3170 0 : static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
3171 : {
3172 0 : struct super_block *sb = pde_data(file_inode(seq->file));
3173 0 : unsigned long position;
3174 :
3175 0 : ++*pos;
3176 0 : if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
3177 : return NULL;
3178 0 : position = *pos + 1;
3179 0 : return (void *) ((unsigned long) position);
3180 : }
3181 :
3182 0 : static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
3183 : {
3184 0 : struct super_block *sb = pde_data(file_inode(seq->file));
3185 0 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3186 0 : unsigned long position = ((unsigned long) v);
3187 0 : struct ext4_group_info *grp;
3188 0 : unsigned int count;
3189 :
3190 0 : position--;
3191 0 : if (position >= MB_NUM_ORDERS(sb)) {
3192 0 : position -= MB_NUM_ORDERS(sb);
3193 0 : if (position == 0)
3194 0 : seq_puts(seq, "avg_fragment_size_lists:\n");
3195 :
3196 0 : count = 0;
3197 0 : read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
3198 0 : list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
3199 : bb_avg_fragment_size_node)
3200 0 : count++;
3201 0 : read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
3202 0 : seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3203 : (unsigned int)position, count);
3204 0 : return 0;
3205 : }
3206 :
3207 0 : if (position == 0) {
3208 0 : seq_printf(seq, "optimize_scan: %d\n",
3209 0 : test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
3210 0 : seq_puts(seq, "max_free_order_lists:\n");
3211 : }
3212 0 : count = 0;
3213 0 : read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
3214 0 : list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
3215 : bb_largest_free_order_node)
3216 0 : count++;
3217 0 : read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
3218 0 : seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3219 : (unsigned int)position, count);
3220 :
3221 0 : return 0;
3222 : }
3223 :
3224 0 : static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
3225 : {
3226 0 : }
3227 :
3228 : const struct seq_operations ext4_mb_seq_structs_summary_ops = {
3229 : .start = ext4_mb_seq_structs_summary_start,
3230 : .next = ext4_mb_seq_structs_summary_next,
3231 : .stop = ext4_mb_seq_structs_summary_stop,
3232 : .show = ext4_mb_seq_structs_summary_show,
3233 : };
3234 :
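The iterator above biases each seq_file position by one so that returning NULL can terminate the walk, then splits the decoded position into two halves: the first MB_NUM_ORDERS(sb) slots report the largest-free-order lists and the remaining slots the average-fragment-size lists. A minimal user-space sketch of that decoding, with a hypothetical num_orders standing in for MB_NUM_ORDERS(sb):

#include <stdio.h>

/* Decode one biased position the way the summary show() callback does. */
static void decode_position(unsigned long v, unsigned long num_orders)
{
	unsigned long position = v - 1;		/* undo the +1 bias from start/next */

	if (position >= num_orders)
		printf("avg_fragment_size list, order %lu\n", position - num_orders);
	else
		printf("largest_free_orders list, order %lu\n", position);
}

int main(void)
{
	unsigned long num_orders = 14;		/* assumed: 4k blocks, orders 0..13 */

	decode_position(1, num_orders);			/* first half  */
	decode_position(num_orders + 1, num_orders);	/* second half */
	return 0;
}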
3235 : static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
3236 : {
3237 859087 : int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3238 1718174 : struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
3239 :
3240 859087 : BUG_ON(!cachep);
3241 859087 : return cachep;
3242 : }
3243 :
3244 : /*
3245 : * Allocate the top-level s_group_info array for the specified number
3246 : * of groups
3247 : */
3248 2561 : int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
3249 : {
3250 2561 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3251 2561 : unsigned size;
3252 2561 : struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
3253 :
3254 0 : size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
3255 2561 : EXT4_DESC_PER_BLOCK_BITS(sb);
3256 2561 : if (size <= sbi->s_group_info_size)
3257 : return 0;
3258 :
3259 2547 : size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
3260 2547 : new_groupinfo = kvzalloc(size, GFP_KERNEL);
3261 2547 : if (!new_groupinfo) {
3262 0 : ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
3263 0 : return -ENOMEM;
3264 : }
3265 2547 : rcu_read_lock();
3266 2547 : old_groupinfo = rcu_dereference(sbi->s_group_info);
3267 2547 : if (old_groupinfo)
3268 22 : memcpy(new_groupinfo, old_groupinfo,
3269 : sbi->s_group_info_size * sizeof(*sbi->s_group_info));
3270 2547 : rcu_read_unlock();
3271 2547 : rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
3272 2547 : sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
3273 2547 : if (old_groupinfo)
3274 11 : ext4_kvfree_array_rcu(old_groupinfo);
3275 : ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
3276 : sbi->s_group_info_size);
3277 : return 0;
3278 : }
3279 :
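As a rough stand-alone illustration of the sizing logic above (one pointer slot per descriptor block, rounded up to a power of two so later resizes simply double the array); the 128-descriptors-per-block figure is an assumption for 4k blocks with 32-byte descriptors:

#include <stdio.h>

static size_t roundup_pow_of_two_sz(size_t x)
{
	size_t r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int desc_per_block = 128;	/* assumed: 4k block, 32-byte descriptors */
	unsigned int ngroups = 1000;

	/* number of second-level tables (one per descriptor block) */
	unsigned int size = (ngroups + desc_per_block - 1) / desc_per_block;

	/* bytes for the top-level pointer array, rounded up like the code above */
	size_t bytes = roundup_pow_of_two_sz(sizeof(void *) * size);

	printf("%u meta tables, %zu bytes for the top-level array\n", size, bytes);
	return 0;
}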
3280 : /* Create and initialize ext4_group_info data for the given group. */
3281 856551 : int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
3282 : struct ext4_group_desc *desc)
3283 : {
3284 856551 : int i;
3285 856551 : int metalen = 0;
3286 856551 : int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
3287 856551 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3288 856551 : struct ext4_group_info **meta_group_info;
3289 856551 : struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3290 :
3291 : /*
3292 : * First check if this group is the first of a reserved block.
3293 : * If it's true, we have to allocate a new table of pointers
3294 : * to ext4_group_info structures
3295 : */
3296 856551 : if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3297 16459 : metalen = sizeof(*meta_group_info) <<
3298 16459 : EXT4_DESC_PER_BLOCK_BITS(sb);
3299 16459 : meta_group_info = kmalloc(metalen, GFP_NOFS);
3300 16459 : if (meta_group_info == NULL) {
3301 0 : ext4_msg(sb, KERN_ERR, "can't allocate mem "
3302 : "for a buddy group");
3303 0 : return -ENOMEM;
3304 : }
3305 16459 : rcu_read_lock();
3306 16459 : rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
3307 16459 : rcu_read_unlock();
3308 : }
3309 :
3310 856551 : meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
3311 856551 : i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
3312 :
3313 856551 : meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
3314 856551 : if (meta_group_info[i] == NULL) {
3315 0 : ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
3316 0 : goto exit_group_info;
3317 : }
3318 856551 : set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
3319 856551 : &(meta_group_info[i]->bb_state));
3320 :
3321 : /*
3322 : * initialize bb_free to be able to skip
3323 : * empty groups without initialization
3324 : */
3325 856551 : if (ext4_has_group_desc_csum(sb) &&
3326 853736 : (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3327 1390588 : meta_group_info[i]->bb_free =
3328 695294 : ext4_free_clusters_after_init(sb, group, desc);
3329 : } else {
3330 322514 : meta_group_info[i]->bb_free =
3331 161257 : ext4_free_group_clusters(sb, desc);
3332 : }
3333 :
3334 856551 : INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
3335 856551 : init_rwsem(&meta_group_info[i]->alloc_sem);
3336 856551 : meta_group_info[i]->bb_free_root = RB_ROOT;
3337 856551 : INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
3338 856551 : INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
3339 856551 : meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
3340 856551 : meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
3341 856551 : meta_group_info[i]->bb_group = group;
3342 :
3343 856551 : mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
3344 856551 : return 0;
3345 :
3346 : exit_group_info:
3347 : /* If a meta_group_info table has been allocated, release it now */
3348 0 : if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3349 0 : struct ext4_group_info ***group_info;
3350 :
3351 0 : rcu_read_lock();
3352 0 : group_info = rcu_dereference(sbi->s_group_info);
3353 0 : kfree(group_info[idx]);
3354 0 : group_info[idx] = NULL;
3355 0 : rcu_read_unlock();
3356 : }
3357 : return -ENOMEM;
3358 : } /* ext4_mb_add_groupinfo */
3359 :
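A small sketch of the two-level lookup used by the function above: the group number is split into a table index and a slot inside that table (again assuming 128 descriptors per block, i.e. a shift of 7):

#include <stdio.h>

int main(void)
{
	unsigned int desc_per_block_bits = 7;	/* assumed: 2^7 = 128 descriptors per block */
	unsigned int group = 1000;

	unsigned int idx = group >> desc_per_block_bits;		/* which table   */
	unsigned int off = group & ((1u << desc_per_block_bits) - 1);	/* slot in table */

	printf("group %u -> s_group_info[%u][%u]\n", group, idx, off);
	return 0;
}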
3360 2536 : static int ext4_mb_init_backend(struct super_block *sb)
3361 : {
3362 2536 : ext4_group_t ngroups = ext4_get_groups_count(sb);
3363 2536 : ext4_group_t i;
3364 2536 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3365 2536 : int err;
3366 2536 : struct ext4_group_desc *desc;
3367 2536 : struct ext4_group_info ***group_info;
3368 2536 : struct kmem_cache *cachep;
3369 :
3370 2536 : err = ext4_mb_alloc_groupinfo(sb, ngroups);
3371 2536 : if (err)
3372 : return err;
3373 :
3374 2536 : sbi->s_buddy_cache = new_inode(sb);
3375 2536 : if (sbi->s_buddy_cache == NULL) {
3376 0 : ext4_msg(sb, KERN_ERR, "can't get new inode");
3377 0 : goto err_freesgi;
3378 : }
3379 : /* To avoid potentially colliding with a valid on-disk inode number,
3380 : * use EXT4_BAD_INO for the buddy cache inode number. This inode is
3381 : * not in the inode hash, so it should never be found by iget(), but
3382 : * this will avoid confusion if it ever shows up during debugging. */
3383 2536 : sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
3384 2536 : EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
3385 854586 : for (i = 0; i < ngroups; i++) {
3386 852050 : cond_resched();
3387 852050 : desc = ext4_get_group_desc(sb, i, NULL);
3388 852050 : if (desc == NULL) {
3389 0 : ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
3390 0 : goto err_freebuddy;
3391 : }
3392 852050 : if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
3393 0 : goto err_freebuddy;
3394 : }
3395 :
3396 2536 : if (ext4_has_feature_flex_bg(sb)) {
3397 : /* a single flex group is supposed to be read by a single IO.
3398 : * Since s_mb_prefetch is an unsigned int, 1 << s_log_groups_per_flex
3399 : * must not overflow it, so shifts of 32 or more are rejected.
3400 : */
3401 2345 : if (sbi->s_es->s_log_groups_per_flex >= 32) {
3402 0 : ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
3403 0 : goto err_freebuddy;
3404 : }
3405 2345 : sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
3406 : BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
3407 2345 : sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
3408 : } else {
3409 191 : sbi->s_mb_prefetch = 32;
3410 : }
3411 2536 : if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
3412 701 : sbi->s_mb_prefetch = ext4_get_groups_count(sb);
3413 : /* how many real IOs to prefetch within a single allocation at cr=0;
3414 : * given that cr=0 is a CPU-related optimization we shouldn't try to
3415 : * load too many groups, at some point we should start to use what
3416 : * we've got in memory.
3417 : * with an average random access time 5ms, it'd take a second to get
3418 : * 200 groups (* N with flex_bg), so let's make this limit 4
3419 : */
3420 2536 : sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
3421 2536 : if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
3422 2518 : sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
3423 :
3424 : return 0;
3425 :
3426 0 : err_freebuddy:
3427 0 : cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3428 0 : while (i-- > 0) {
3429 0 : struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3430 :
3431 0 : if (grp)
3432 0 : kmem_cache_free(cachep, grp);
3433 : }
3434 0 : i = sbi->s_group_info_size;
3435 0 : rcu_read_lock();
3436 0 : group_info = rcu_dereference(sbi->s_group_info);
3437 0 : while (i-- > 0)
3438 0 : kfree(group_info[i]);
3439 0 : rcu_read_unlock();
3440 0 : iput(sbi->s_buddy_cache);
3441 0 : err_freesgi:
3442 0 : rcu_read_lock();
3443 0 : kvfree(rcu_dereference(sbi->s_group_info));
3444 0 : rcu_read_unlock();
3445 0 : return -ENOMEM;
3446 : }
3447 :
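A rough worked example of the prefetch sizing computed above, under assumed values (4k blocks, s_log_groups_per_flex = 4, BLK_MAX_SEGMENT_SIZE = 64 KiB, 8192 groups); this mirrors only the arithmetic, not the kernel structures:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;		/* assumed 4k blocks */
	unsigned int log_groups_per_flex = 4;		/* assumed: 16 groups per flex group */
	unsigned int blk_max_segment_size = 65536;	/* assumed BLK_MAX_SEGMENT_SIZE */
	unsigned int ngroups = 8192;

	unsigned int cap = blk_max_segment_size >> (blocksize_bits - 9);
	unsigned int prefetch = 1u << log_groups_per_flex;

	if (prefetch > cap)
		prefetch = cap;
	prefetch *= 8;				/* 8 prefetch IOs in flight at most */
	if (prefetch > ngroups)
		prefetch = ngroups;

	unsigned int limit = prefetch * 4;	/* don't read too far ahead at cr=0 */
	if (limit > ngroups)
		limit = ngroups;

	printf("s_mb_prefetch=%u s_mb_prefetch_limit=%u\n", prefetch, limit);
	return 0;
}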
3448 0 : static void ext4_groupinfo_destroy_slabs(void)
3449 : {
3450 0 : int i;
3451 :
3452 0 : for (i = 0; i < NR_GRPINFO_CACHES; i++) {
3453 0 : kmem_cache_destroy(ext4_groupinfo_caches[i]);
3454 0 : ext4_groupinfo_caches[i] = NULL;
3455 : }
3456 0 : }
3457 :
3458 2536 : static int ext4_groupinfo_create_slab(size_t size)
3459 : {
3460 2536 : static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
3461 2536 : int slab_size;
3462 2536 : int blocksize_bits = order_base_2(size);
3463 2536 : int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3464 2536 : struct kmem_cache *cachep;
3465 :
3466 2536 : if (cache_index >= NR_GRPINFO_CACHES)
3467 : return -EINVAL;
3468 :
3469 2536 : if (unlikely(cache_index < 0))
3470 0 : cache_index = 0;
3471 :
3472 2536 : mutex_lock(&ext4_grpinfo_slab_create_mutex);
3473 2536 : if (ext4_groupinfo_caches[cache_index]) {
3474 2533 : mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3475 2533 : return 0; /* Already created */
3476 : }
3477 :
3478 3 : slab_size = offsetof(struct ext4_group_info,
3479 : bb_counters[blocksize_bits + 2]);
3480 :
3481 3 : cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
3482 : slab_size, 0, SLAB_RECLAIM_ACCOUNT,
3483 : NULL);
3484 :
3485 3 : ext4_groupinfo_caches[cache_index] = cachep;
3486 :
3487 3 : mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3488 3 : if (!cachep) {
3489 0 : printk(KERN_EMERG
3490 : "EXT4-fs: no memory for groupinfo slab cache\n");
3491 0 : return -ENOMEM;
3492 : }
3493 :
3494 : return 0;
3495 : }
3496 :
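The slab object size above comes from offsetof() over the trailing bb_counters[] array, one counter per buddy order. A minimal stand-alone sketch of the same idiom (the struct below is a hypothetical stand-in, not the real ext4_group_info):

#include <stdio.h>
#include <stddef.h>

#define BLOCKSIZE_BITS 12	/* assumed 4k blocks */

struct demo_group_info {
	unsigned long bb_state;
	int bb_free;
	unsigned short bb_counters[];	/* one slot per buddy order */
};

int main(void)
{
	/* orders 0 .. BLOCKSIZE_BITS+1, hence BLOCKSIZE_BITS+2 counters */
	size_t slab_size = offsetof(struct demo_group_info,
				    bb_counters[BLOCKSIZE_BITS + 2]);

	printf("object size: %zu bytes\n", slab_size);
	return 0;
}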
3497 0 : static void ext4_discard_work(struct work_struct *work)
3498 : {
3499 0 : struct ext4_sb_info *sbi = container_of(work,
3500 : struct ext4_sb_info, s_discard_work);
3501 0 : struct super_block *sb = sbi->s_sb;
3502 0 : struct ext4_free_data *fd, *nfd;
3503 0 : struct ext4_buddy e4b;
3504 0 : struct list_head discard_list;
3505 0 : ext4_group_t grp, load_grp;
3506 0 : int err = 0;
3507 :
3508 0 : INIT_LIST_HEAD(&discard_list);
3509 0 : spin_lock(&sbi->s_md_lock);
3510 0 : list_splice_init(&sbi->s_discard_list, &discard_list);
3511 0 : spin_unlock(&sbi->s_md_lock);
3512 :
3513 0 : load_grp = UINT_MAX;
3514 0 : list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
3515 : /*
3516 : * If the filesystem is unmounting, we are out of memory, or we are
3517 : * suffering from no space, give up the discard
3518 : */
3519 0 : if ((sb->s_flags & SB_ACTIVE) && !err &&
3520 : !atomic_read(&sbi->s_retry_alloc_pending)) {
3521 0 : grp = fd->efd_group;
3522 0 : if (grp != load_grp) {
3523 0 : if (load_grp != UINT_MAX)
3524 0 : ext4_mb_unload_buddy(&e4b);
3525 :
3526 0 : err = ext4_mb_load_buddy(sb, grp, &e4b);
3527 0 : if (err) {
3528 0 : kmem_cache_free(ext4_free_data_cachep, fd);
3529 0 : load_grp = UINT_MAX;
3530 0 : continue;
3531 : } else {
3532 : load_grp = grp;
3533 : }
3534 : }
3535 :
3536 0 : ext4_lock_group(sb, grp);
3537 0 : ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
3538 0 : fd->efd_start_cluster + fd->efd_count - 1, 1);
3539 0 : ext4_unlock_group(sb, grp);
3540 : }
3541 0 : kmem_cache_free(ext4_free_data_cachep, fd);
3542 : }
3543 :
3544 0 : if (load_grp != UINT_MAX)
3545 0 : ext4_mb_unload_buddy(&e4b);
3546 0 : }
3547 :
3548 2536 : int ext4_mb_init(struct super_block *sb)
3549 : {
3550 2536 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3551 2536 : unsigned i, j;
3552 2536 : unsigned offset, offset_incr;
3553 2536 : unsigned max;
3554 2536 : int ret;
3555 :
3556 2536 : i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
3557 :
3558 2536 : sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
3559 2536 : if (sbi->s_mb_offsets == NULL) {
3560 0 : ret = -ENOMEM;
3561 0 : goto out;
3562 : }
3563 :
3564 2536 : i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
3565 2536 : sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
3566 2536 : if (sbi->s_mb_maxs == NULL) {
3567 0 : ret = -ENOMEM;
3568 0 : goto out;
3569 : }
3570 :
3571 2536 : ret = ext4_groupinfo_create_slab(sb->s_blocksize);
3572 2536 : if (ret < 0)
3573 0 : goto out;
3574 :
3575 : /* order 0 is regular bitmap */
3576 2536 : sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
3577 2536 : sbi->s_mb_offsets[0] = 0;
3578 :
3579 2536 : i = 1;
3580 2536 : offset = 0;
3581 2536 : offset_incr = 1 << (sb->s_blocksize_bits - 1);
3582 2536 : max = sb->s_blocksize << 2;
3583 32953 : do {
3584 32953 : sbi->s_mb_offsets[i] = offset;
3585 32953 : sbi->s_mb_maxs[i] = max;
3586 32953 : offset += offset_incr;
3587 32953 : offset_incr = offset_incr >> 1;
3588 32953 : max = max >> 1;
3589 32953 : i++;
3590 32953 : } while (i < MB_NUM_ORDERS(sb));
3591 :
3592 5072 : sbi->s_mb_avg_fragment_size =
3593 2536 : kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3594 : GFP_KERNEL);
3595 2536 : if (!sbi->s_mb_avg_fragment_size) {
3596 0 : ret = -ENOMEM;
3597 0 : goto out;
3598 : }
3599 5072 : sbi->s_mb_avg_fragment_size_locks =
3600 2536 : kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3601 : GFP_KERNEL);
3602 2536 : if (!sbi->s_mb_avg_fragment_size_locks) {
3603 0 : ret = -ENOMEM;
3604 0 : goto out;
3605 : }
3606 38025 : for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3607 35489 : INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
3608 35489 : rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
3609 : }
3610 5072 : sbi->s_mb_largest_free_orders =
3611 2536 : kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3612 : GFP_KERNEL);
3613 2536 : if (!sbi->s_mb_largest_free_orders) {
3614 0 : ret = -ENOMEM;
3615 0 : goto out;
3616 : }
3617 5072 : sbi->s_mb_largest_free_orders_locks =
3618 2536 : kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3619 : GFP_KERNEL);
3620 2536 : if (!sbi->s_mb_largest_free_orders_locks) {
3621 0 : ret = -ENOMEM;
3622 0 : goto out;
3623 : }
3624 38025 : for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3625 35489 : INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
3626 35489 : rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
3627 : }
3628 :
3629 2536 : spin_lock_init(&sbi->s_md_lock);
3630 2536 : sbi->s_mb_free_pending = 0;
3631 2536 : INIT_LIST_HEAD(&sbi->s_freed_data_list);
3632 2536 : INIT_LIST_HEAD(&sbi->s_discard_list);
3633 2536 : INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
3634 2536 : atomic_set(&sbi->s_retry_alloc_pending, 0);
3635 :
3636 2536 : sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
3637 2536 : sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
3638 2536 : sbi->s_mb_stats = MB_DEFAULT_STATS;
3639 2536 : sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
3640 2536 : sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
3641 2536 : sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
3642 :
3643 : /*
3644 : * The default group preallocation is 512, which for 4k block
3645 : * sizes translates to 2 megabytes. However for bigalloc file
3646 : * systems, this is probably too big (i.e, if the cluster size
3647 : * is 1 megabyte, then group preallocation size becomes half a
3648 : * gigabyte!). As a default, we will keep a two megabyte
3649 : * group prealloc size for cluster sizes up to 64k, and after
3650 : * that, we will force a minimum group preallocation size of
3651 : * 32 clusters. This translates to 8 megs when the cluster
3652 : * size is 256k, and 32 megs when the cluster size is 1 meg,
3653 : * which seems reasonable as a default.
3654 : */
3655 2536 : sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
3656 : sbi->s_cluster_bits, 32);
3657 : /*
3658 : * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
3659 : * to the lowest multiple of s_stripe which is bigger than
3660 : * the s_mb_group_prealloc as determined above. We want
3661 : * the preallocation size to be an exact multiple of the
3662 : * RAID stripe size so that preallocations don't fragment
3663 : * the stripes.
3664 : */
3665 2536 : if (sbi->s_stripe > 1) {
3666 2493 : sbi->s_mb_group_prealloc = roundup(
3667 : sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
3668 : }
3669 :
3670 2536 : sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
3671 2536 : if (sbi->s_locality_groups == NULL) {
3672 0 : ret = -ENOMEM;
3673 0 : goto out;
3674 : }
3675 12680 : for_each_possible_cpu(i) {
3676 10144 : struct ext4_locality_group *lg;
3677 10144 : lg = per_cpu_ptr(sbi->s_locality_groups, i);
3678 10144 : mutex_init(&lg->lg_mutex);
3679 121728 : for (j = 0; j < PREALLOC_TB_SIZE; j++)
3680 101440 : INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
3681 10144 : spin_lock_init(&lg->lg_prealloc_lock);
3682 : }
3683 :
3684 2536 : if (bdev_nonrot(sb->s_bdev))
3685 6 : sbi->s_mb_max_linear_groups = 0;
3686 : else
3687 2530 : sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
3688 : /* init file for buddy data */
3689 2536 : ret = ext4_mb_init_backend(sb);
3690 2536 : if (ret != 0)
3691 0 : goto out_free_locality_groups;
3692 :
3693 : return 0;
3694 :
3695 : out_free_locality_groups:
3696 0 : free_percpu(sbi->s_locality_groups);
3697 0 : sbi->s_locality_groups = NULL;
3698 0 : out:
3699 0 : kfree(sbi->s_mb_avg_fragment_size);
3700 0 : kfree(sbi->s_mb_avg_fragment_size_locks);
3701 0 : kfree(sbi->s_mb_largest_free_orders);
3702 0 : kfree(sbi->s_mb_largest_free_orders_locks);
3703 0 : kfree(sbi->s_mb_offsets);
3704 0 : sbi->s_mb_offsets = NULL;
3705 0 : kfree(sbi->s_mb_maxs);
3706 0 : sbi->s_mb_maxs = NULL;
3707 0 : return ret;
3708 : }
3709 :
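A rough worked example of the group preallocation sizing described in ext4_mb_init() above, assuming MB_DEFAULT_GROUP_PREALLOC = 512 clusters and a few representative cluster/stripe settings; this mirrors only the arithmetic:

#include <stdio.h>

static unsigned int roundup_to(unsigned int x, unsigned int m)
{
	return ((x + m - 1) / m) * m;
}

static unsigned int group_prealloc(unsigned int cluster_bits,
				   unsigned int stripe_clusters)
{
	unsigned int prealloc = 512 >> cluster_bits;	/* assumed MB_DEFAULT_GROUP_PREALLOC */

	if (prealloc < 32)
		prealloc = 32;					/* floor of 32 clusters */
	if (stripe_clusters > 1)
		prealloc = roundup_to(prealloc, stripe_clusters); /* align to RAID stripe */
	return prealloc;
}

int main(void)
{
	printf("%u\n", group_prealloc(0, 0));	/* 4k fs, no bigalloc: 512 clusters (2 MiB)  */
	printf("%u\n", group_prealloc(8, 0));	/* 1 MiB clusters: floor kicks in, 32 clusters */
	printf("%u\n", group_prealloc(8, 48));	/* same, 48-cluster stripe: rounded up to 48 */
	return 0;
}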
3710 : /* needs to be called with the ext4 group lock held */
3711 856551 : static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
3712 : {
3713 856551 : struct ext4_prealloc_space *pa;
3714 856551 : struct list_head *cur, *tmp;
3715 856551 : int count = 0;
3716 :
3717 857324 : list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
3718 773 : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3719 773 : list_del(&pa->pa_group_list);
3720 773 : count++;
3721 773 : kmem_cache_free(ext4_pspace_cachep, pa);
3722 : }
3723 856551 : return count;
3724 : }
3725 :
3726 2536 : int ext4_mb_release(struct super_block *sb)
3727 : {
3728 2536 : ext4_group_t ngroups = ext4_get_groups_count(sb);
3729 2536 : ext4_group_t i;
3730 2536 : int num_meta_group_infos;
3731 2536 : struct ext4_group_info *grinfo, ***group_info;
3732 2536 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3733 2536 : struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3734 2536 : int count;
3735 :
3736 2536 : if (test_opt(sb, DISCARD)) {
3737 : /*
3738 : * wait the discard work to drain all of ext4_free_data
3739 : */
3740 8 : flush_work(&sbi->s_discard_work);
3741 8 : WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
3742 : }
3743 :
3744 2536 : if (sbi->s_group_info) {
3745 859087 : for (i = 0; i < ngroups; i++) {
3746 856551 : cond_resched();
3747 856551 : grinfo = ext4_get_group_info(sb, i);
3748 856551 : if (!grinfo)
3749 0 : continue;
3750 856551 : mb_group_bb_bitmap_free(grinfo);
3751 856551 : ext4_lock_group(sb, i);
3752 856551 : count = ext4_mb_cleanup_pa(grinfo);
3753 856551 : if (count)
3754 : mb_debug(sb, "mballoc: %d PAs left\n",
3755 : count);
3756 856551 : ext4_unlock_group(sb, i);
3757 856551 : kmem_cache_free(cachep, grinfo);
3758 : }
3759 2536 : num_meta_group_infos = (ngroups +
3760 0 : EXT4_DESC_PER_BLOCK(sb) - 1) >>
3761 2536 : EXT4_DESC_PER_BLOCK_BITS(sb);
3762 2536 : rcu_read_lock();
3763 2536 : group_info = rcu_dereference(sbi->s_group_info);
3764 18995 : for (i = 0; i < num_meta_group_infos; i++)
3765 16459 : kfree(group_info[i]);
3766 2536 : kvfree(group_info);
3767 2536 : rcu_read_unlock();
3768 : }
3769 2536 : kfree(sbi->s_mb_avg_fragment_size);
3770 2536 : kfree(sbi->s_mb_avg_fragment_size_locks);
3771 2536 : kfree(sbi->s_mb_largest_free_orders);
3772 2536 : kfree(sbi->s_mb_largest_free_orders_locks);
3773 2536 : kfree(sbi->s_mb_offsets);
3774 2536 : kfree(sbi->s_mb_maxs);
3775 2536 : iput(sbi->s_buddy_cache);
3776 2536 : if (sbi->s_mb_stats) {
3777 0 : ext4_msg(sb, KERN_INFO,
3778 : "mballoc: %u blocks %u reqs (%u success)",
3779 : atomic_read(&sbi->s_bal_allocated),
3780 : atomic_read(&sbi->s_bal_reqs),
3781 : atomic_read(&sbi->s_bal_success));
3782 0 : ext4_msg(sb, KERN_INFO,
3783 : "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
3784 : "%u 2^N hits, %u breaks, %u lost",
3785 : atomic_read(&sbi->s_bal_ex_scanned),
3786 : atomic_read(&sbi->s_bal_groups_scanned),
3787 : atomic_read(&sbi->s_bal_goals),
3788 : atomic_read(&sbi->s_bal_2orders),
3789 : atomic_read(&sbi->s_bal_breaks),
3790 : atomic_read(&sbi->s_mb_lost_chunks));
3791 0 : ext4_msg(sb, KERN_INFO,
3792 : "mballoc: %u generated and it took %llu",
3793 : atomic_read(&sbi->s_mb_buddies_generated),
3794 : atomic64_read(&sbi->s_mb_generation_time));
3795 0 : ext4_msg(sb, KERN_INFO,
3796 : "mballoc: %u preallocated, %u discarded",
3797 : atomic_read(&sbi->s_mb_preallocated),
3798 : atomic_read(&sbi->s_mb_discarded));
3799 : }
3800 :
3801 2536 : free_percpu(sbi->s_locality_groups);
3802 :
3803 2536 : return 0;
3804 : }
3805 :
3806 17545 : static inline int ext4_issue_discard(struct super_block *sb,
3807 : ext4_group_t block_group, ext4_grpblk_t cluster, int count,
3808 : struct bio **biop)
3809 : {
3810 17545 : ext4_fsblk_t discard_block;
3811 :
3812 17545 : discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
3813 : ext4_group_first_block_no(sb, block_group));
3814 17545 : count = EXT4_C2B(EXT4_SB(sb), count);
3815 17545 : trace_ext4_discard_blocks(sb,
3816 : (unsigned long long) discard_block, count);
3817 17545 : if (biop) {
3818 0 : return __blkdev_issue_discard(sb->s_bdev,
3819 0 : (sector_t)discard_block << (sb->s_blocksize_bits - 9),
3820 0 : (sector_t)count << (sb->s_blocksize_bits - 9),
3821 : GFP_NOFS, biop);
3822 : } else
3823 17545 : return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
3824 : }
3825 :
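The discard above is handed to the block layer in 512-byte sectors. A rough stand-alone sketch of the unit conversion, with assumed values (4k blocks, 16 blocks per cluster, hypothetical block numbers):

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;		/* assumed 4k filesystem blocks */
	unsigned int cluster_ratio_bits = 4;		/* assumed 16 blocks per cluster */
	unsigned long long first_block = 34816;		/* hypothetical starting block */
	unsigned int count_clusters = 8;

	/* clusters -> filesystem blocks (what EXT4_C2B() does) */
	unsigned long long count_blocks =
		(unsigned long long)count_clusters << cluster_ratio_bits;

	/* filesystem blocks -> 512-byte sectors for the block layer */
	unsigned long long sector = first_block << (blocksize_bits - 9);
	unsigned long long nr_sects = count_blocks << (blocksize_bits - 9);

	printf("discard %llu sectors starting at sector %llu\n", nr_sects, sector);
	return 0;
}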
3826 1664427 : static void ext4_free_data_in_buddy(struct super_block *sb,
3827 : struct ext4_free_data *entry)
3828 : {
3829 1664427 : struct ext4_buddy e4b;
3830 1664427 : struct ext4_group_info *db;
3831 1664427 : int err, count = 0;
3832 :
3833 1664427 : mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
3834 : entry->efd_count, entry->efd_group, entry);
3835 :
3836 1664427 : err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
3837 : /* we expect to find existing buddy because it's pinned */
3838 1664427 : BUG_ON(err != 0);
3839 :
3840 1664427 : spin_lock(&EXT4_SB(sb)->s_md_lock);
3841 1664427 : EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
3842 1664427 : spin_unlock(&EXT4_SB(sb)->s_md_lock);
3843 :
3844 1664427 : db = e4b.bd_info;
3845 : /* there are blocks to put in buddy to make them really free */
3846 1664427 : count += entry->efd_count;
3847 1664427 : ext4_lock_group(sb, entry->efd_group);
3848 : /* Take it out of per group rb tree */
3849 1664427 : rb_erase(&entry->efd_node, &(db->bb_free_root));
3850 1664427 : mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
3851 :
3852 : /*
3853 : * Clear the trimmed flag for the group so that the next
3854 : * ext4_trim_fs can trim it.
3855 : * If the volume is mounted with -o discard, online discard
3856 : * is supported and the free blocks will be trimmed online.
3857 : */
3858 1664427 : if (!test_opt(sb, DISCARD))
3859 1664427 : EXT4_MB_GRP_CLEAR_TRIMMED(db);
3860 :
3861 1664427 : if (!db->bb_free_root.rb_node) {
3862 : /* No more items in the per group rb tree
3863 : * balance refcounts from ext4_mb_free_metadata()
3864 : */
3865 212339 : put_page(e4b.bd_buddy_page);
3866 212339 : put_page(e4b.bd_bitmap_page);
3867 : }
3868 1664427 : ext4_unlock_group(sb, entry->efd_group);
3869 1664427 : ext4_mb_unload_buddy(&e4b);
3870 :
3871 1664427 : mb_debug(sb, "freed %d blocks in 1 structures\n", count);
3872 1664427 : }
3873 :
3874 : /*
3875 : * This function is called by the jbd2 layer once the commit has finished,
3876 : * so we know we can free the blocks that were released with that commit.
3877 : */
3878 206991 : void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
3879 : {
3880 206991 : struct ext4_sb_info *sbi = EXT4_SB(sb);
3881 206991 : struct ext4_free_data *entry, *tmp;
3882 206991 : struct list_head freed_data_list;
3883 206991 : struct list_head *cut_pos = NULL;
3884 206991 : bool wake;
3885 :
3886 206991 : INIT_LIST_HEAD(&freed_data_list);
3887 :
3888 206991 : spin_lock(&sbi->s_md_lock);
3889 1871418 : list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
3890 1673997 : if (entry->efd_tid != commit_tid)
3891 : break;
3892 1664427 : cut_pos = &entry->efd_list;
3893 : }
3894 206991 : if (cut_pos)
3895 136905 : list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
3896 : cut_pos);
3897 206991 : spin_unlock(&sbi->s_md_lock);
3898 :
3899 1871418 : list_for_each_entry(entry, &freed_data_list, efd_list)
3900 1664427 : ext4_free_data_in_buddy(sb, entry);
3901 :
3902 206991 : if (test_opt(sb, DISCARD)) {
3903 0 : spin_lock(&sbi->s_md_lock);
3904 0 : wake = list_empty(&sbi->s_discard_list);
3905 0 : list_splice_tail(&freed_data_list, &sbi->s_discard_list);
3906 0 : spin_unlock(&sbi->s_md_lock);
3907 0 : if (wake)
3908 0 : queue_work(system_unbound_wq, &sbi->s_discard_work);
3909 : } else {
3910 1871418 : list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
3911 1664427 : kmem_cache_free(ext4_free_data_cachep, entry);
3912 : }
3913 206991 : }
3914 :
3915 12 : int __init ext4_init_mballoc(void)
3916 : {
3917 12 : ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
3918 : SLAB_RECLAIM_ACCOUNT);
3919 12 : if (ext4_pspace_cachep == NULL)
3920 0 : goto out;
3921 :
3922 12 : ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
3923 : SLAB_RECLAIM_ACCOUNT);
3924 12 : if (ext4_ac_cachep == NULL)
3925 0 : goto out_pa_free;
3926 :
3927 12 : ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
3928 : SLAB_RECLAIM_ACCOUNT);
3929 12 : if (ext4_free_data_cachep == NULL)
3930 0 : goto out_ac_free;
3931 :
3932 : return 0;
3933 :
3934 : out_ac_free:
3935 0 : kmem_cache_destroy(ext4_ac_cachep);
3936 0 : out_pa_free:
3937 0 : kmem_cache_destroy(ext4_pspace_cachep);
3938 : out:
3939 : return -ENOMEM;
3940 : }
3941 :
3942 0 : void ext4_exit_mballoc(void)
3943 : {
3944 : /*
3945 : * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3946 : * before destroying the slab cache.
3947 : */
3948 0 : rcu_barrier();
3949 0 : kmem_cache_destroy(ext4_pspace_cachep);
3950 0 : kmem_cache_destroy(ext4_ac_cachep);
3951 0 : kmem_cache_destroy(ext4_free_data_cachep);
3952 0 : ext4_groupinfo_destroy_slabs();
3953 0 : }
3954 :
3955 :
3956 : /*
3957 : * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
3958 : * Returns 0 if success or error code
3959 : */
3960 : static noinline_for_stack int
3961 3619061 : ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3962 : handle_t *handle, unsigned int reserv_clstrs)
3963 : {
3964 3619061 : struct buffer_head *bitmap_bh = NULL;
3965 3619061 : struct ext4_group_desc *gdp;
3966 3619061 : struct buffer_head *gdp_bh;
3967 3619061 : struct ext4_sb_info *sbi;
3968 3619061 : struct super_block *sb;
3969 3619061 : ext4_fsblk_t block;
3970 3619061 : int err, len;
3971 :
3972 3619061 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3973 3619061 : BUG_ON(ac->ac_b_ex.fe_len <= 0);
3974 :
3975 3619061 : sb = ac->ac_sb;
3976 3619061 : sbi = EXT4_SB(sb);
3977 :
3978 3619061 : bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3979 3619141 : if (IS_ERR(bitmap_bh)) {
3980 0 : return PTR_ERR(bitmap_bh);
3981 : }
3982 :
3983 3619141 : BUFFER_TRACE(bitmap_bh, "getting write access");
3984 3619141 : err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
3985 : EXT4_JTR_NONE);
3986 3619851 : if (err)
3987 0 : goto out_err;
3988 :
3989 3619851 : err = -EIO;
3990 3619851 : gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3991 3619899 : if (!gdp)
3992 0 : goto out_err;
3993 :
3994 3619899 : ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3995 : ext4_free_group_clusters(sb, gdp));
3996 :
3997 3619899 : BUFFER_TRACE(gdp_bh, "get_write_access");
3998 3619899 : err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
3999 3619879 : if (err)
4000 0 : goto out_err;
4001 :
4002 3619879 : block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4003 :
4004 3619693 : len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4005 3619693 : if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
4006 0 : ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
4007 : "fs metadata", block, block+len);
4008 : /* File system mounted not to panic on error
4009 : * Fix the bitmap and return EFSCORRUPTED
4010 : * We leak some of the blocks here.
4011 : */
4012 0 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
4013 0 : mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
4014 : ac->ac_b_ex.fe_len);
4015 0 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
4016 0 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4017 0 : if (!err)
4018 0 : err = -EFSCORRUPTED;
4019 0 : goto out_err;
4020 : }
4021 :
4022 3619683 : ext4_lock_group(sb, ac->ac_b_ex.fe_group);
4023 : #ifdef AGGRESSIVE_CHECK
4024 : {
4025 : int i;
4026 : for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
4027 : BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
4028 : bitmap_bh->b_data));
4029 : }
4030 : }
4031 : #endif
4032 3620016 : mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
4033 : ac->ac_b_ex.fe_len);
4034 3619947 : if (ext4_has_group_desc_csum(sb) &&
4035 3616123 : (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
4036 3980 : gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
4037 3980 : ext4_free_group_clusters_set(sb, gdp,
4038 : ext4_free_clusters_after_init(sb,
4039 : ac->ac_b_ex.fe_group, gdp));
4040 : }
4041 3619821 : len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
4042 3619951 : ext4_free_group_clusters_set(sb, gdp, len);
4043 3619794 : ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
4044 3619289 : ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
4045 :
4046 3619660 : ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
4047 3619864 : percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
4048 : /*
4049 : * Now reduce the dirty block count also. Should not go negative
4050 : */
4051 3619878 : if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
4052 : /* release all the reserved blocks if non delalloc */
4053 2732966 : percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4054 : reserv_clstrs);
4055 :
4056 3619863 : if (sbi->s_log_groups_per_flex) {
4057 3615605 : ext4_group_t flex_group = ext4_flex_group(sbi,
4058 : ac->ac_b_ex.fe_group);
4059 7231113 : atomic64_sub(ac->ac_b_ex.fe_len,
4060 3615549 : &sbi_array_rcu_deref(sbi, s_flex_groups,
4061 : flex_group)->free_clusters);
4062 : }
4063 :
4064 3619836 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4065 3619834 : if (err)
4066 0 : goto out_err;
4067 3619834 : err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
4068 :
4069 3619890 : out_err:
4070 3619890 : brelse(bitmap_bh);
4071 : return err;
4072 : }
4073 :
4074 : /*
4075 : * Idempotent helper for Ext4 fast commit replay path to set the state of
4076 : * blocks in bitmaps and update counters.
4077 : */
4078 0 : void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
4079 : int len, int state)
4080 : {
4081 0 : struct buffer_head *bitmap_bh = NULL;
4082 0 : struct ext4_group_desc *gdp;
4083 0 : struct buffer_head *gdp_bh;
4084 0 : struct ext4_sb_info *sbi = EXT4_SB(sb);
4085 0 : ext4_group_t group;
4086 0 : ext4_grpblk_t blkoff;
4087 0 : int i, err;
4088 0 : int already;
4089 0 : unsigned int clen, clen_changed, thisgrp_len;
4090 :
4091 0 : while (len > 0) {
4092 0 : ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
4093 :
4094 : /*
4095 : * Check to see if we are freeing blocks across a group
4096 : * boundary.
4097 : * In case of flex_bg, this can happen that (block, len) may
4098 : * span across more than one group. In that case we need to
4099 : * get the corresponding group metadata to work with.
4100 : * For this we have goto again loop.
4101 : */
4102 0 : thisgrp_len = min_t(unsigned int, (unsigned int)len,
4103 : EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
4104 0 : clen = EXT4_NUM_B2C(sbi, thisgrp_len);
4105 :
4106 0 : if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
4107 0 : ext4_error(sb, "Marking blocks in system zone - "
4108 : "Block = %llu, len = %u",
4109 : block, thisgrp_len);
4110 0 : bitmap_bh = NULL;
4111 0 : break;
4112 : }
4113 :
4114 0 : bitmap_bh = ext4_read_block_bitmap(sb, group);
4115 0 : if (IS_ERR(bitmap_bh)) {
4116 0 : err = PTR_ERR(bitmap_bh);
4117 0 : bitmap_bh = NULL;
4118 0 : break;
4119 : }
4120 :
4121 0 : err = -EIO;
4122 0 : gdp = ext4_get_group_desc(sb, group, &gdp_bh);
4123 0 : if (!gdp)
4124 : break;
4125 :
4126 0 : ext4_lock_group(sb, group);
4127 0 : already = 0;
4128 0 : for (i = 0; i < clen; i++)
4129 0 : if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
4130 : !state)
4131 0 : already++;
4132 :
4133 0 : clen_changed = clen - already;
4134 0 : if (state)
4135 0 : mb_set_bits(bitmap_bh->b_data, blkoff, clen);
4136 : else
4137 0 : mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
4138 0 : if (ext4_has_group_desc_csum(sb) &&
4139 0 : (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
4140 0 : gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
4141 0 : ext4_free_group_clusters_set(sb, gdp,
4142 : ext4_free_clusters_after_init(sb, group, gdp));
4143 : }
4144 0 : if (state)
4145 0 : clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
4146 : else
4147 0 : clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
4148 :
4149 0 : ext4_free_group_clusters_set(sb, gdp, clen);
4150 0 : ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
4151 0 : ext4_group_desc_csum_set(sb, group, gdp);
4152 :
4153 0 : ext4_unlock_group(sb, group);
4154 :
4155 0 : if (sbi->s_log_groups_per_flex) {
4156 0 : ext4_group_t flex_group = ext4_flex_group(sbi, group);
4157 0 : struct flex_groups *fg = sbi_array_rcu_deref(sbi,
4158 : s_flex_groups, flex_group);
4159 :
4160 0 : if (state)
4161 0 : atomic64_sub(clen_changed, &fg->free_clusters);
4162 : else
4163 0 : atomic64_add(clen_changed, &fg->free_clusters);
4164 :
4165 : }
4166 :
4167 0 : err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
4168 0 : if (err)
4169 : break;
4170 0 : sync_dirty_buffer(bitmap_bh);
4171 0 : err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
4172 0 : sync_dirty_buffer(gdp_bh);
4173 0 : if (err)
4174 : break;
4175 :
4176 0 : block += thisgrp_len;
4177 0 : len -= thisgrp_len;
4178 0 : brelse(bitmap_bh);
4179 0 : BUG_ON(len < 0);
4180 : }
4181 :
4182 0 : if (err)
4183 0 : brelse(bitmap_bh);
4184 0 : }
4185 :
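A rough sketch of how the loop in ext4_mb_mark_bb() above walks a (block, len) range one group at a time (assuming 32768 blocks per group and no bigalloc; all values are hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned long long block = 98000;	/* hypothetical start block */
	unsigned int len = 4000;
	unsigned int blocks_per_group = 32768;	/* assumed EXT4_BLOCKS_PER_GROUP */

	while (len > 0) {
		unsigned int group = block / blocks_per_group;
		unsigned int blkoff = block % blocks_per_group;
		unsigned int thisgrp_len = blocks_per_group - blkoff;

		if (thisgrp_len > len)
			thisgrp_len = len;	/* don't run past the requested range */

		printf("group %u: mark %u blocks at offset %u\n",
		       group, thisgrp_len, blkoff);
		block += thisgrp_len;
		len -= thisgrp_len;
	}
	return 0;
}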
4186 : /*
4187 : * here we normalize request for locality group
4188 : * Group request are normalized to s_mb_group_prealloc, which goes to
4189 : * s_strip if we set the same via mount option.
4190 : * s_mb_group_prealloc can be configured via
4191 : * /sys/fs/ext4/<partition>/mb_group_prealloc
4192 : *
4193 : * XXX: should we try to preallocate more than the group has now?
4194 : */
4195 : static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
4196 : {
4197 27358 : struct super_block *sb = ac->ac_sb;
4198 27358 : struct ext4_locality_group *lg = ac->ac_lg;
4199 :
4200 0 : BUG_ON(lg == NULL);
4201 27358 : ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
4202 27358 : mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
4203 : }
4204 :
4205 : /*
4206 : * This function returns the next element to look at during inode
4207 : * PA rbtree walk. We assume that we have held the inode PA rbtree lock
4208 : * (ei->i_prealloc_lock)
4209 : *
4210 : * new_start The start of the range we want to compare
4211 : * cur_start The existing start that we are comparing against
4212 : * node The node of the rb_tree
4213 : */
4214 : static inline struct rb_node*
4215 : ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
4216 : {
4217 13065709 : if (new_start < cur_start)
4218 3935445 : return node->rb_left;
4219 : else
4220 9130264 : return node->rb_right;
4221 : }
4222 :
4223 : static inline void
4224 1475377 : ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
4225 : ext4_lblk_t start, ext4_lblk_t end)
4226 : {
4227 1475377 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4228 1475377 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4229 1475377 : struct ext4_prealloc_space *tmp_pa;
4230 1475377 : ext4_lblk_t tmp_pa_start, tmp_pa_end;
4231 1475377 : struct rb_node *iter;
4232 :
4233 1475377 : read_lock(&ei->i_prealloc_lock);
4234 5088826 : for (iter = ei->i_prealloc_node.rb_node; iter;
4235 3613320 : iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
4236 3613376 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4237 : pa_node.inode_node);
4238 3613376 : tmp_pa_start = tmp_pa->pa_lstart;
4239 3613376 : tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
4240 :
4241 3613376 : spin_lock(&tmp_pa->pa_lock);
4242 3611023 : if (tmp_pa->pa_deleted == 0)
4243 3610733 : BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
4244 3611023 : spin_unlock(&tmp_pa->pa_lock);
4245 : }
4246 1475450 : read_unlock(&ei->i_prealloc_lock);
4247 1475476 : }
4248 :
4249 : /*
4250 : * Given an allocation context "ac" and a range "start", "end", check
4251 : * and adjust boundaries if the range overlaps with any of the existing
4252 : * preallocatoins stored in the corresponding inode of the allocation context.
4253 : *
4254 : * Parameters:
4255 : * ac allocation context
4256 : * start start of the new range
4257 : * end end of the new range
4258 : */
4259 : static inline void
4260 1474824 : ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
4261 : ext4_lblk_t *start, ext4_lblk_t *end)
4262 : {
4263 1474824 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4264 1474824 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4265 1474824 : struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
4266 1474824 : struct rb_node *iter;
4267 1474824 : ext4_lblk_t new_start, new_end;
4268 1474824 : ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
4269 :
4270 1474824 : new_start = *start;
4271 1474824 : new_end = *end;
4272 :
4273 : /*
4274 : * Adjust the normalized range so that it doesn't overlap with any
4275 : * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
4276 : * so it doesn't change underneath us.
4277 : */
4278 1474824 : read_lock(&ei->i_prealloc_lock);
4279 :
4280 : /* Step 1: find any one immediate neighboring PA of the normalized range */
4281 5088783 : for (iter = ei->i_prealloc_node.rb_node; iter;
4282 3613594 : iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
4283 : tmp_pa_start, iter)) {
4284 3613389 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4285 : pa_node.inode_node);
4286 3613389 : tmp_pa_start = tmp_pa->pa_lstart;
4287 3613389 : tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
4288 :
4289 : /* PA must not overlap original request */
4290 3613389 : spin_lock(&tmp_pa->pa_lock);
4291 3610515 : if (tmp_pa->pa_deleted == 0)
4292 3610228 : BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
4293 : ac->ac_o_ex.fe_logical < tmp_pa_start));
4294 3610515 : spin_unlock(&tmp_pa->pa_lock);
4295 : }
4296 :
4297 : /*
4298 : * Step 2: check if the found PA is left or right neighbor and
4299 : * get the other neighbor
4300 : */
4301 1475394 : if (tmp_pa) {
4302 489427 : if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
4303 279317 : struct rb_node *tmp;
4304 :
4305 279317 : left_pa = tmp_pa;
4306 279317 : tmp = rb_next(&left_pa->pa_node.inode_node);
4307 279294 : if (tmp) {
4308 126675 : right_pa = rb_entry(tmp,
4309 : struct ext4_prealloc_space,
4310 : pa_node.inode_node);
4311 : }
4312 : } else {
4313 210110 : struct rb_node *tmp;
4314 :
4315 210110 : right_pa = tmp_pa;
4316 210110 : tmp = rb_prev(&right_pa->pa_node.inode_node);
4317 210113 : if (tmp) {
4318 : left_pa = rb_entry(tmp,
4319 : struct ext4_prealloc_space,
4320 : pa_node.inode_node);
4321 : }
4322 : }
4323 : }
4324 :
4325 : /* Step 3: get the non deleted neighbors */
4326 392746 : if (left_pa) {
4327 392746 : for (iter = &left_pa->pa_node.inode_node;;
4328 99 : iter = rb_prev(iter)) {
4329 392845 : if (!iter) {
4330 : left_pa = NULL;
4331 : break;
4332 : }
4333 :
4334 392765 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4335 : pa_node.inode_node);
4336 392765 : left_pa = tmp_pa;
4337 392765 : spin_lock(&tmp_pa->pa_lock);
4338 392702 : if (tmp_pa->pa_deleted == 0) {
4339 392603 : spin_unlock(&tmp_pa->pa_lock);
4340 : break;
4341 : }
4342 99 : spin_unlock(&tmp_pa->pa_lock);
4343 : }
4344 : }
4345 :
4346 1475307 : if (right_pa) {
4347 336750 : for (iter = &right_pa->pa_node.inode_node;;
4348 48 : iter = rb_next(iter)) {
4349 336798 : if (!iter) {
4350 : right_pa = NULL;
4351 : break;
4352 : }
4353 :
4354 336881 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4355 : pa_node.inode_node);
4356 336881 : right_pa = tmp_pa;
4357 336881 : spin_lock(&tmp_pa->pa_lock);
4358 336777 : if (tmp_pa->pa_deleted == 0) {
4359 336729 : spin_unlock(&tmp_pa->pa_lock);
4360 : break;
4361 : }
4362 48 : spin_unlock(&tmp_pa->pa_lock);
4363 : }
4364 : }
4365 :
4366 1475246 : if (left_pa) {
4367 785184 : left_pa_end =
4368 392592 : left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
4369 392592 : BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
4370 : }
4371 :
4372 1475246 : if (right_pa) {
4373 336723 : right_pa_start = right_pa->pa_lstart;
4374 336723 : BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
4375 : }
4376 :
4377 : /* Step 4: trim our normalized range to not overlap with the neighbors */
4378 1475246 : if (left_pa) {
4379 392602 : if (left_pa_end > new_start)
4380 : new_start = left_pa_end;
4381 : }
4382 :
4383 1475246 : if (right_pa) {
4384 336724 : if (right_pa_start < new_end)
4385 : new_end = right_pa_start;
4386 : }
4387 1475246 : read_unlock(&ei->i_prealloc_lock);
4388 :
4389 : /* XXX: extra loop to check we really don't overlap preallocations */
4390 1475425 : ext4_mb_pa_assert_overlap(ac, new_start, new_end);
4391 :
4392 1475458 : *start = new_start;
4393 1475458 : *end = new_end;
4394 1475458 : }
4395 :
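A minimal sketch of the final trimming step (step 4) of the function above, with hypothetical neighbour preallocations; the real code derives left_pa_end and right_pa_start from the rbtree neighbours:

#include <stdio.h>

int main(void)
{
	/* hypothetical normalized request, in logical blocks */
	unsigned int start = 0, end = 2048;
	unsigned int logical = 1100;		/* the original request lies in here */

	/* hypothetical neighbours: left PA ends at 512, right PA starts at 1536 */
	unsigned int left_pa_end = 512;
	unsigned int right_pa_start = 1536;

	if (left_pa_end > start)
		start = left_pa_end;		/* don't overlap the left PA  */
	if (right_pa_start < end)
		end = right_pa_start;		/* don't overlap the right PA */

	printf("trimmed goal range [%u, %u), still covers block %u\n",
	       start, end, logical);
	return 0;
}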
4396 : /*
4397 : * Normalization means making request better in terms of
4398 : * size and alignment
4399 : */
4400 : static noinline_for_stack void
4401 2678942 : ext4_mb_normalize_request(struct ext4_allocation_context *ac,
4402 : struct ext4_allocation_request *ar)
4403 : {
4404 2678942 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4405 2678942 : struct ext4_super_block *es = sbi->s_es;
4406 2678942 : int bsbits, max;
4407 2678942 : ext4_lblk_t end;
4408 2678942 : loff_t size, start_off;
4409 2678942 : loff_t orig_size __maybe_unused;
4410 2678942 : ext4_lblk_t start;
4411 :
4412 : /* do normalize only data requests, metadata requests
4413 : do not need preallocation */
4414 2678942 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4415 1203935 : return;
4416 :
4417 : /* sometime caller may want exact blocks */
4418 : /* sometimes the caller may want exact blocks */
4419 : return;
4420 :
4421 : /* caller may indicate that preallocation isn't
4422 : * required (it's a tail, for example) */
4423 2147867 : if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
4424 : return;
4425 :
4426 1502365 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
4427 27358 : ext4_mb_normalize_group_request(ac);
4428 27358 : return ;
4429 : }
4430 :
4431 1475007 : bsbits = ac->ac_sb->s_blocksize_bits;
4432 :
4433 : /* first, let's learn actual file size
4434 : * given current request is allocated */
4435 1475007 : size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4436 1475007 : size = size << bsbits;
4437 1475007 : if (size < i_size_read(ac->ac_inode))
4438 : size = i_size_read(ac->ac_inode);
4439 1475007 : orig_size = size;
4440 :
4441 : /* max size of free chunks */
4442 1475007 : max = 2 << bsbits;
4443 :
4444 : #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
4445 : (req <= (size) || max <= (chunk_size))
4446 :
4447 : /* first, try to predict filesize */
4448 : /* XXX: should this table be tunable? */
4449 1475007 : start_off = 0;
4450 1475007 : if (size <= 16 * 1024) {
4451 : size = 16 * 1024;
4452 1474893 : } else if (size <= 32 * 1024) {
4453 : size = 32 * 1024;
4454 1474708 : } else if (size <= 64 * 1024) {
4455 : size = 64 * 1024;
4456 1474673 : } else if (size <= 128 * 1024) {
4457 : size = 128 * 1024;
4458 1466834 : } else if (size <= 256 * 1024) {
4459 : size = 256 * 1024;
4460 1408806 : } else if (size <= 512 * 1024) {
4461 : size = 512 * 1024;
4462 1278285 : } else if (size <= 1024 * 1024) {
4463 : size = 1024 * 1024;
4464 1097414 : } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
4465 0 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4466 73513 : (21 - bsbits)) << 21;
4467 73513 : size = 2 * 1024 * 1024;
4468 1023901 : } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
4469 0 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4470 36993 : (22 - bsbits)) << 22;
4471 36993 : size = 4 * 1024 * 1024;
4472 986908 : } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
4473 : (8<<20)>>bsbits, max, 8 * 1024)) {
4474 0 : start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4475 986908 : (23 - bsbits)) << 23;
4476 986908 : size = 8 * 1024 * 1024;
4477 : } else {
4478 0 : start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
4479 0 : size = (loff_t) EXT4_C2B(sbi,
4480 0 : ac->ac_o_ex.fe_len) << bsbits;
4481 : }
4482 1475007 : size = size >> bsbits;
4483 1475007 : start = start_off >> bsbits;
4484 :
4485 : /*
4486 : * For tiny groups (smaller than 8MB) the chosen allocation
4487 : * alignment may be larger than group size. Make sure the
4488 : * alignment does not move allocation to a different group which
4489 : * makes mballoc fail assertions later.
4490 : */
4491 1475007 : start = max(start, rounddown(ac->ac_o_ex.fe_logical,
4492 : (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
4493 :
4494 : /* don't cover already allocated blocks in selected range */
4495 1475007 : if (ar->pleft && start <= ar->lleft) {
4496 1150979 : size -= ar->lleft + 1 - start;
4497 1150979 : start = ar->lleft + 1;
4498 : }
4499 1475007 : if (ar->pright && start + size - 1 >= ar->lright)
4500 482279 : size -= start + size - ar->lright;
4501 :
4502 : /*
4503 : * Trim allocation request for filesystems with artificially small
4504 : * groups.
4505 : */
4506 1475007 : if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
4507 48 : size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
4508 :
4509 1475007 : end = start + size;
4510 :
4511 1475007 : ext4_mb_pa_adjust_overlap(ac, &start, &end);
4512 :
4513 1475472 : size = end - start;
4514 :
4515 : /*
4516 : * In this function "start" and "size" are normalized for better
4517 : * alignment and length such that we could preallocate more blocks.
4518 : * This normalization is done such that original request of
4519 : * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
4520 : * "size" boundaries.
4521 : * (Note fe_len can be relaxed since FS block allocation API does not
4522 : * provide a guarantee on the number of contiguous blocks allocated since that
4523 : * depends upon free space left, etc).
4524 : * In case of inode pa, later we use the allocated blocks
4525 : * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
4526 : * range of goal/best blocks [start, size] to put it at the
4527 : * ac_o_ex.fe_logical extent of this inode.
4528 : * (See ext4_mb_use_inode_pa() for more details)
4529 : */
4530 1475472 : if (start + size <= ac->ac_o_ex.fe_logical ||
4531 : start > ac->ac_o_ex.fe_logical) {
4532 0 : ext4_msg(ac->ac_sb, KERN_ERR,
4533 : "start %lu, size %lu, fe_logical %lu",
4534 : (unsigned long) start, (unsigned long) size,
4535 : (unsigned long) ac->ac_o_ex.fe_logical);
4536 0 : BUG();
4537 : }
4538 1475472 : BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
4539 :
4540 : /* now prepare goal request */
4541 :
4542 : /* XXX: is it better to align blocks with respect to logical
4543 : * placement or satisfy big request as is */
4544 1475472 : ac->ac_g_ex.fe_logical = start;
4545 1475472 : ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
4546 1475472 : ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
4547 :
4548 : /* define goal start in order to merge */
4549 1475472 : if (ar->pright && (ar->lright == (start + size)) &&
4550 482522 : ar->pright >= size &&
4551 482522 : ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
4552 : /* merge to the right */
4553 482522 : ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
4554 : &ac->ac_g_ex.fe_group,
4555 : &ac->ac_g_ex.fe_start);
4556 482515 : ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4557 : }
4558 2702646 : if (ar->pleft && (ar->lleft + 1 == start) &&
4559 1227181 : ar->pleft + 1 < ext4_blocks_count(es)) {
4560 : /* merge to the left */
4561 1227123 : ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
4562 : &ac->ac_g_ex.fe_group,
4563 : &ac->ac_g_ex.fe_start);
4564 1227125 : ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4565 : }
4566 :
4567 1475467 : mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
4568 : orig_size, start);
4569 : }
4570 :
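A rough, self-contained sketch of the size-prediction table used by ext4_mb_normalize_request() above for the common 4k-block case; the free-chunk-size fallbacks of NRL_CHECK_SIZE() and several small buckets are omitted, and all values are assumptions for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long long logical_byte = 5ULL << 20;	/* request at a 5 MiB offset */
	unsigned long long size = 6ULL << 20;		/* predicted file size: 6 MiB */
	unsigned long long start_off = 0;

	if (size <= 16 * 1024)
		size = 16 * 1024;
	else if (size <= 1024 * 1024)
		size = 1024 * 1024;			/* 32k..512k buckets elided */
	else if (size <= 4ULL * 1024 * 1024) {
		start_off = (logical_byte >> 21) << 21;	/* align to 2 MiB */
		size = 2 * 1024 * 1024;
	} else if (size <= 8ULL * 1024 * 1024) {
		start_off = (logical_byte >> 22) << 22;	/* align to 4 MiB */
		size = 4 * 1024 * 1024;
	} else {
		/* the kernel conditions this last bucket on the request length */
		start_off = (logical_byte >> 23) << 23;	/* align to 8 MiB */
		size = 8 * 1024 * 1024;
	}

	printf("goal window: %llu bytes starting at byte %llu\n", size, start_off);
	return 0;
}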
4571 3620081 : static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
4572 : {
4573 3620081 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4574 :
4575 3620081 : if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
4576 0 : atomic_inc(&sbi->s_bal_reqs);
4577 0 : atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
4578 0 : if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
4579 0 : atomic_inc(&sbi->s_bal_success);
4580 :
4581 0 : atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
4582 0 : for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
4583 0 : atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
4584 : }
4585 :
4586 0 : atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
4587 0 : if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
4588 0 : ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
4589 0 : atomic_inc(&sbi->s_bal_goals);
4590 : /* did we allocate as much as normalizer originally wanted? */
4591 0 : if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
4592 0 : atomic_inc(&sbi->s_bal_len_goals);
4593 :
4594 0 : if (ac->ac_found > sbi->s_mb_max_to_scan)
4595 0 : atomic_inc(&sbi->s_bal_breaks);
4596 : }
4597 :
4598 3620081 : if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
4599 2679829 : trace_ext4_mballoc_alloc(ac);
4600 : else
4601 940252 : trace_ext4_mballoc_prealloc(ac);
4602 3619796 : }
4603 :
4604 : /*
4605 : * Called on failure; free up any blocks from the inode PA for this
4606 : * context. We don't need this for MB_GROUP_PA because we only change
4607 : * pa_free in ext4_mb_release_context(), but on failure, we've already
4608 : * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
4609 : */
4610 1 : static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
4611 : {
4612 1 : struct ext4_prealloc_space *pa = ac->ac_pa;
4613 1 : struct ext4_buddy e4b;
4614 1 : int err;
4615 :
4616 1 : if (pa == NULL) {
4617 1 : if (ac->ac_f_ex.fe_len == 0)
4618 1 : return;
4619 0 : err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
4620 0 : if (WARN_RATELIMIT(err,
4621 : "ext4: mb_load_buddy failed (%d)", err))
4622 : /*
4623 : * This should never happen since we pin the
4624 : * pages in the ext4_allocation_context so
4625 : * ext4_mb_load_buddy() should never fail.
4626 : */
4627 : return;
4628 0 : ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4629 0 : mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
4630 : ac->ac_f_ex.fe_len);
4631 0 : ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4632 0 : ext4_mb_unload_buddy(&e4b);
4633 0 : return;
4634 : }
4635 0 : if (pa->pa_type == MB_INODE_PA) {
4636 0 : spin_lock(&pa->pa_lock);
4637 0 : pa->pa_free += ac->ac_b_ex.fe_len;
4638 0 : spin_unlock(&pa->pa_lock);
4639 : }
4640 : }
4641 :
4642 : /*
4643 : * use blocks preallocated to inode
4644 : */
4645 1390611 : static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
4646 : struct ext4_prealloc_space *pa)
4647 : {
4648 1390611 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4649 1390611 : ext4_fsblk_t start;
4650 1390611 : ext4_fsblk_t end;
4651 1390611 : int len;
4652 :
4653 : /* found preallocated blocks, use them */
4654 1390611 : start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
4655 1390611 : end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
4656 : start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
4657 1390611 : len = EXT4_NUM_B2C(sbi, end - start);
4658 1390611 : ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
4659 : &ac->ac_b_ex.fe_start);
4660 1390429 : ac->ac_b_ex.fe_len = len;
4661 1390429 : ac->ac_status = AC_STATUS_FOUND;
4662 1390429 : ac->ac_pa = pa;
4663 :
4664 1390429 : BUG_ON(start < pa->pa_pstart);
4665 1390429 : BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
4666 1390429 : BUG_ON(pa->pa_free < len);
4667 1390429 : BUG_ON(ac->ac_b_ex.fe_len <= 0);
4668 1390429 : pa->pa_free -= len;
4669 :
4670 1390429 : mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
4671 1390429 : }
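
A rough, stand-alone sketch of the mapping performed in ext4_mb_use_inode_pa() above; all block numbers are hypothetical, and EXT4_C2B()/EXT4_NUM_B2C() are treated as 1:1 (one cluster per block) to keep the arithmetic visible:

#include <stdio.h>

int main(void)
{
	/* hypothetical inode PA: 64 clusters at physical 10000, logical 200 */
	unsigned long long pa_pstart = 10000, pa_lstart = 200, pa_len = 64;
	/* hypothetical original request: 16 clusters at logical 232 */
	unsigned long long fe_logical = 232, fe_len = 16;

	/* start = pa_pstart + (fe_logical - pa_lstart) */
	unsigned long long start = pa_pstart + (fe_logical - pa_lstart);
	/* end is clamped to both the PA end and the request end */
	unsigned long long pa_end = pa_pstart + pa_len;
	unsigned long long req_end = start + fe_len;
	unsigned long long end = pa_end < req_end ? pa_end : req_end;

	/* prints: use [10032, 10048), len 16 */
	printf("use [%llu, %llu), len %llu\n", start, end, end - start);
	return 0;
}
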
4672 :
4673 : /*
4674 : * use blocks preallocated to locality group
4675 : */
4676 266501 : static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
4677 : struct ext4_prealloc_space *pa)
4678 : {
4679 266501 : unsigned int len = ac->ac_o_ex.fe_len;
4680 :
4681 266501 : ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
4682 : &ac->ac_b_ex.fe_group,
4683 : &ac->ac_b_ex.fe_start);
4684 266501 : ac->ac_b_ex.fe_len = len;
4685 266501 : ac->ac_status = AC_STATUS_FOUND;
4686 266501 : ac->ac_pa = pa;
4687 :
4688             : /* we don't correct pa_pstart or pa_len here to avoid a
4689             : * possible race when the group is being loaded concurrently;
4690             : * instead we correct the pa later, after blocks are marked
4691             : * in the on-disk bitmap -- see ext4_mb_release_context().
4692             : * Other CPUs are prevented from allocating from this pa by lg_mutex.
4693 : */
4694 266501 : mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
4695 : pa->pa_lstart, len, pa);
4696 266501 : }
4697 :
4698 : /*
4699             : * Return the prealloc space that has the minimal distance
4700             : * from the goal block. @cpa is the prealloc
4701             : * space with the currently known minimal distance
4702             : * from the goal block.
4703 : */
4704 : static struct ext4_prealloc_space *
4705 272497 : ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
4706 : struct ext4_prealloc_space *pa,
4707 : struct ext4_prealloc_space *cpa)
4708 : {
4709 272497 : ext4_fsblk_t cur_distance, new_distance;
4710 :
4711 272497 : if (cpa == NULL) {
4712 264806 : atomic_inc(&pa->pa_count);
4713 264806 : return pa;
4714 : }
4715 7691 : cur_distance = abs(goal_block - cpa->pa_pstart);
4716 7691 : new_distance = abs(goal_block - pa->pa_pstart);
4717 :
4718 7691 : if (cur_distance <= new_distance)
4719 : return cpa;
4720 :
4721 : /* drop the previous reference */
4722 7442 : atomic_dec(&cpa->pa_count);
4723 7442 : atomic_inc(&pa->pa_count);
4724 7442 : return pa;
4725 : }
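
A stand-alone sketch of the "closest preallocation wins" rule implemented above, with made-up pa_pstart values; on a tie the current holder is kept, as in the function:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long long goal = 5000;                      /* hypothetical goal block */
	long long pstarts[] = { 4000, 5600, 4900 }; /* hypothetical group PAs  */
	long long best = -1, best_dist = 0;

	for (int i = 0; i < 3; i++) {
		long long dist = llabs(goal - pstarts[i]);

		/* strict '<' keeps the current holder on a tie */
		if (best < 0 || dist < best_dist) {
			best = pstarts[i];
			best_dist = dist;
		}
	}
	/* prints: closest pa_pstart 4900, distance 100 */
	printf("closest pa_pstart %lld, distance %lld\n", best, best_dist);
	return 0;
}
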
4726 :
4727 : /*
4728 : * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
4729 : */
4730 : static bool
4731 674968 : ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
4732 : struct ext4_prealloc_space *pa)
4733 : {
4734 674968 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4735 674968 : ext4_fsblk_t start;
4736 :
4737 674968 : if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
4738 : return true;
4739 :
4740 : /*
4741 : * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
4742 : * in ext4_mb_normalize_request and will keep same with ac_o_ex
4743             : * in ext4_mb_normalize_request and will stay the same as ac_o_ex
4744             : * from ext4_mb_initialize_context. Use ac_g_ex here to stay
4745             : * consistent with ext4_mb_find_by_goal.
4746 0 : start = pa->pa_pstart +
4747 0 : (ac->ac_g_ex.fe_logical - pa->pa_lstart);
4748 0 : if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
4749 : return false;
4750 :
4751 0 : if (ac->ac_g_ex.fe_len > pa->pa_len -
4752 0 : EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
4753 0 : return false;
4754 :
4755 : return true;
4756 : }
4757 :
4758 : /*
4759 : * search goal blocks in preallocated space
4760 : */
4761 : static noinline_for_stack bool
4762 3618480 : ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
4763 : {
4764 3618480 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4765 3618480 : int order, i;
4766 3618480 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4767 3618480 : struct ext4_locality_group *lg;
4768 3618480 : struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
4769 3618480 : loff_t tmp_pa_end;
4770 3618480 : struct rb_node *iter;
4771 3618480 : ext4_fsblk_t goal_block;
4772 :
4773 : /* only data can be preallocated */
4774 3618480 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4775 : return false;
4776 :
4777 : /*
4778 : * first, try per-file preallocation by searching the inode pa rbtree.
4779 : *
4780 : * Here, we can't do a direct traversal of the tree because
4781             : * ext4_mb_discard_group_preallocations() can concurrently mark the pa
4782             : * deleted, and that can cause a direct traversal to skip some entries.
4783 : */
4784 3087229 : read_lock(&ei->i_prealloc_lock);
4785 :
4786 3087954 : if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
4787 1869964 : goto try_group_pa;
4788 : }
4789 :
4790 : /*
4791 : * Step 1: Find a pa with logical start immediately adjacent to the
4792 : * original logical start. This could be on the left or right.
4793 : *
4794 : * (tmp_pa->pa_lstart never changes so we can skip locking for it).
4795 : */
4796 7056785 : for (iter = ei->i_prealloc_node.rb_node; iter;
4797 5838795 : iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
4798 : tmp_pa->pa_lstart, iter)) {
4799 5838795 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4800 : pa_node.inode_node);
4801 : }
4802 :
4803 : /*
4804 : * Step 2: The adjacent pa might be to the right of logical start, find
4805 : * the left adjacent pa. After this step we'd have a valid tmp_pa whose
4806             : * logical start is at or to the left of the original request's logical start
4807 : */
4808 1217990 : if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
4809 268775 : struct rb_node *tmp;
4810 268775 : tmp = rb_prev(&tmp_pa->pa_node.inode_node);
4811 :
4812 268772 : if (tmp) {
4813 : tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
4814 : pa_node.inode_node);
4815 : } else {
4816 : /*
4817             : * If there is no adjacent pa to the left, then finding
4818             : * an overlapping pa is not possible, so stop searching the
4819             : * inode pa tree
4820 : */
4821 120096 : goto try_group_pa;
4822 : }
4823 : }
4824 :
4825 1097891 : BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
4826 :
4827 : /*
4828 : * Step 3: If the left adjacent pa is deleted, keep moving left to find
4829 : * the first non deleted adjacent pa. After this step we should have a
4830 : * valid tmp_pa which is guaranteed to be non deleted.
4831 : */
4832 1097990 : for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
4833 1097990 : if (!iter) {
4834 : /*
4835 : * no non deleted left adjacent pa, so stop searching
4836 : * inode pa tree
4837 : */
4838 82 : goto try_group_pa;
4839 : }
4840 1097908 : tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4841 : pa_node.inode_node);
4842 1097908 : spin_lock(&tmp_pa->pa_lock);
4843 1098128 : if (tmp_pa->pa_deleted == 0) {
4844 : /*
4845 : * We will keep holding the pa_lock from
4846 : * this point on because we don't want group discard
4847 : * to delete this pa underneath us. Since group
4848             : * discard is anyway an ENOSPC operation, it
4849 : * should be okay for it to wait a few more cycles.
4850 : */
4851 : break;
4852 : } else {
4853 99 : spin_unlock(&tmp_pa->pa_lock);
4854 : }
4855 : }
4856 :
4857 1098029 : BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
4858 1098029 : BUG_ON(tmp_pa->pa_deleted == 1);
4859 :
4860 : /*
4861 : * Step 4: We now have the non deleted left adjacent pa. Only this
4862 : * pa can possibly satisfy the request hence check if it overlaps
4863 : * original logical start and stop searching if it doesn't.
4864 : */
4865 1098029 : tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
4866 :
4867 1098029 : if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
4868 422803 : spin_unlock(&tmp_pa->pa_lock);
4869 422900 : goto try_group_pa;
4870 : }
4871 :
4872 : /* non-extent files can't have physical blocks past 2^32 */
4873 675226 : if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
4874 3953 : (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
4875 : EXT4_MAX_BLOCK_FILE_PHYS)) {
4876 : /*
4877 : * Since PAs don't overlap, we won't find any other PA to
4878 : * satisfy this.
4879 : */
4880 0 : spin_unlock(&tmp_pa->pa_lock);
4881 0 : goto try_group_pa;
4882 : }
4883 :
4884 675226 : if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
4885 674687 : atomic_inc(&tmp_pa->pa_count);
4886 675414 : ext4_mb_use_inode_pa(ac, tmp_pa);
4887 674714 : spin_unlock(&tmp_pa->pa_lock);
4888 675212 : read_unlock(&ei->i_prealloc_lock);
4889 675212 : return true;
4890 : } else {
4891 : /*
4892 : * We found a valid overlapping pa but couldn't use it because
4893 : * it had no free blocks. This should ideally never happen
4894 : * because:
4895 : *
4896 : * 1. When a new inode pa is added to rbtree it must have
4897 : * pa_free > 0 since otherwise we won't actually need
4898 : * preallocation.
4899 : *
4900             : * 2. An inode pa that is in the rbtree can only have its
4901 : * pa_free become zero when another thread calls:
4902 : * ext4_mb_new_blocks
4903 : * ext4_mb_use_preallocated
4904 : * ext4_mb_use_inode_pa
4905 : *
4906 : * 3. Further, after the above calls make pa_free == 0, we will
4907 : * immediately remove it from the rbtree in:
4908 : * ext4_mb_new_blocks
4909 : * ext4_mb_release_context
4910 : * ext4_mb_put_pa
4911 : *
4912             : * 4. Since pa_free becoming 0 and the pa getting removed from
4913             : * the tree both happen in ext4_mb_new_blocks, which is always
4914 : * called with i_data_sem held for data allocations, we can be
4915 : * sure that another process will never see a pa in rbtree with
4916 : * pa_free == 0.
4917 : */
4918 144 : WARN_ON_ONCE(tmp_pa->pa_free == 0);
4919 : }
4920 144 : spin_unlock(&tmp_pa->pa_lock);
4921 2413042 : try_group_pa:
4922 2413042 : read_unlock(&ei->i_prealloc_lock);
4923 :
4924 : /* can we use group allocation? */
4925 2412963 : if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
4926 : return false;
4927 :
4928 : /* inode may have no locality group for some reason */
4929 295863 : lg = ac->ac_lg;
4930 295863 : if (lg == NULL)
4931 : return false;
4932 295863 : order = fls(ac->ac_o_ex.fe_len) - 1;
4933 295863 : if (order > PREALLOC_TB_SIZE - 1)
4934 : /* The max size of hash table is PREALLOC_TB_SIZE */
4935 : order = PREALLOC_TB_SIZE - 1;
4936 :
4937 295863 : goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
4938 : /*
4939             : * search for the prealloc space that has the
4940             : * minimal distance from the goal block.
4941 : */
4942 3389368 : for (i = order; i < PREALLOC_TB_SIZE; i++) {
4943 2797626 : rcu_read_lock();
4944 3257214 : list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
4945 : pa_node.lg_list) {
4946 459521 : spin_lock(&tmp_pa->pa_lock);
4947 459521 : if (tmp_pa->pa_deleted == 0 &&
4948 459521 : tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
4949 :
4950 272497 : cpa = ext4_mb_check_group_pa(goal_block,
4951 : tmp_pa, cpa);
4952 : }
4953 459521 : spin_unlock(&tmp_pa->pa_lock);
4954 : }
4955 2797683 : rcu_read_unlock();
4956 : }
4957 295879 : if (cpa) {
4958 264806 : ext4_mb_use_group_pa(ac, cpa);
4959 264806 : return true;
4960 : }
4961 : return false;
4962 : }
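
The locality-group lookup above starts at list index fls(fe_len) - 1 and then walks the higher-order lists. A stand-alone sketch of that bucket selection; PREALLOC_TB_SIZE is assumed to be 10 here, and fls() is emulated with a plain loop rather than taken from the kernel:

#include <stdio.h>

#define PREALLOC_TB_SIZE 10	/* assumed value for illustration */

static int fls_emul(unsigned int x)
{
	int r = 0;

	/* position of the highest set bit, 1-based (0 for x == 0) */
	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned int lengths[] = { 1, 2, 7, 8, 100, 4096 };

	for (unsigned int i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++) {
		int order = fls_emul(lengths[i]) - 1;

		/* clamp to the last hash-table bucket */
		if (order > PREALLOC_TB_SIZE - 1)
			order = PREALLOC_TB_SIZE - 1;
		printf("fe_len %4u -> start scanning at list %d\n",
		       lengths[i], order);
	}
	return 0;
}
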
4963 :
4964 : /*
4965             : * the function goes through all blocks freed in the group
4966             : * but not yet committed and marks them used in the in-core bitmap.
4967 : * buddy must be generated from this bitmap
4968 : * Need to be called with the ext4 group lock held
4969 : */
4970 160468 : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
4971 : ext4_group_t group)
4972 : {
4973 160468 : struct rb_node *n;
4974 160468 : struct ext4_group_info *grp;
4975 160468 : struct ext4_free_data *entry;
4976 :
4977 160468 : grp = ext4_get_group_info(sb, group);
4978 160468 : if (!grp)
4979 : return;
4980 160468 : n = rb_first(&(grp->bb_free_root));
4981 :
4982 160468 : while (n) {
4983 0 : entry = rb_entry(n, struct ext4_free_data, efd_node);
4984 0 : mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
4985 0 : n = rb_next(n);
4986 : }
4987 : return;
4988 : }
4989 :
4990 : /*
4991             : * the function goes through all preallocations in this group and marks them
4992             : * used in the in-core bitmap. The buddy must be generated from this bitmap.
4993 : * Need to be called with ext4 group lock held
4994 : */
4995 : static noinline_for_stack
4996 160468 : void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
4997 : ext4_group_t group)
4998 : {
4999 160468 : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
5000 160468 : struct ext4_prealloc_space *pa;
5001 160468 : struct list_head *cur;
5002 160468 : ext4_group_t groupnr;
5003 160468 : ext4_grpblk_t start;
5004 160468 : int preallocated = 0;
5005 160468 : int len;
5006 :
5007 160468 : if (!grp)
5008 0 : return;
5009 :
5010             : /* all forms of preallocation discard first load the group,
5011             : * so the only competing code is preallocation use.
5012             : * we don't need any locking here.
5013             : * notice we do NOT ignore preallocations with pa_deleted set;
5014             : * otherwise we could leave used blocks available for
5015             : * allocation in the buddy when a concurrent ext4_mb_put_pa()
5016             : * is dropping the preallocation
5017 : */
5018 160495 : list_for_each(cur, &grp->bb_prealloc_list) {
5019 27 : pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
5020 27 : spin_lock(&pa->pa_lock);
5021 27 : ext4_get_group_no_and_offset(sb, pa->pa_pstart,
5022 : &groupnr, &start);
5023 27 : len = pa->pa_len;
5024 27 : spin_unlock(&pa->pa_lock);
5025 27 : if (unlikely(len == 0))
5026 0 : continue;
5027 27 : BUG_ON(groupnr != group);
5028 27 : mb_set_bits(bitmap, start, len);
5029 27 : preallocated += len;
5030 : }
5031 160468 : mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
5032 : }
5033 :
5034 716150 : static void ext4_mb_mark_pa_deleted(struct super_block *sb,
5035 : struct ext4_prealloc_space *pa)
5036 : {
5037 716150 : struct ext4_inode_info *ei;
5038 :
5039 716150 : if (pa->pa_deleted) {
5040 0 : ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
5041 : pa->pa_type, pa->pa_pstart, pa->pa_lstart,
5042 : pa->pa_len);
5043 0 : return;
5044 : }
5045 :
5046 716150 : pa->pa_deleted = 1;
5047 :
5048 716150 : if (pa->pa_type == MB_INODE_PA) {
5049 715214 : ei = EXT4_I(pa->pa_inode);
5050 715214 : atomic_dec(&ei->i_prealloc_active);
5051 : }
5052 : }
5053 :
5054 2679110 : static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
5055 : {
5056 2679110 : BUG_ON(!pa);
5057 2679110 : BUG_ON(atomic_read(&pa->pa_count));
5058 2679110 : BUG_ON(pa->pa_deleted == 0);
5059 2679110 : kmem_cache_free(ext4_pspace_cachep, pa);
5060 2679069 : }
5061 :
5062 922 : static void ext4_mb_pa_callback(struct rcu_head *head)
5063 : {
5064 922 : struct ext4_prealloc_space *pa;
5065 :
5066 922 : pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
5067 922 : ext4_mb_pa_free(pa);
5068 922 : }
5069 :
5070 : /*
5071 : * drops a reference to preallocated space descriptor
5072 : * if this was the last reference and the space is consumed
5073 : */
5074 1657264 : static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
5075 : struct super_block *sb, struct ext4_prealloc_space *pa)
5076 : {
5077 1657264 : ext4_group_t grp;
5078 1657264 : ext4_fsblk_t grp_blk;
5079 1657264 : struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
5080 :
5081 : /* in this short window concurrent discard can set pa_deleted */
5082 1657264 : spin_lock(&pa->pa_lock);
5083 1657328 : if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
5084 1645197 : spin_unlock(&pa->pa_lock);
5085 1645197 : return;
5086 : }
5087 :
5088 12139 : if (pa->pa_deleted == 1) {
5089 0 : spin_unlock(&pa->pa_lock);
5090 0 : return;
5091 : }
5092 :
5093 12139 : ext4_mb_mark_pa_deleted(sb, pa);
5094 12139 : spin_unlock(&pa->pa_lock);
5095 :
5096 12139 : grp_blk = pa->pa_pstart;
5097 : /*
5098 : * If doing group-based preallocation, pa_pstart may be in the
5099 : * next group when pa is used up
5100 : */
5101 12139 : if (pa->pa_type == MB_GROUP_PA)
5102 746 : grp_blk--;
5103 :
5104 12139 : grp = ext4_get_group_number(sb, grp_blk);
5105 :
5106 : /*
5107 : * possible race:
5108 : *
5109 : * P1 (buddy init) P2 (regular allocation)
5110 : * find block B in PA
5111 : * copy on-disk bitmap to buddy
5112 : * mark B in on-disk bitmap
5113 : * drop PA from group
5114 : * mark all PAs in buddy
5115 : *
5116 : * thus, P1 initializes buddy with B available. to prevent this
5117 : * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
5118 : * against that pair
5119 : */
5120 12139 : ext4_lock_group(sb, grp);
5121 12139 : list_del(&pa->pa_group_list);
5122 12139 : ext4_unlock_group(sb, grp);
5123 :
5124 12139 : if (pa->pa_type == MB_INODE_PA) {
5125 11393 : write_lock(pa->pa_node_lock.inode_lock);
5126 11393 : rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
5127 11393 : write_unlock(pa->pa_node_lock.inode_lock);
5128 11393 : ext4_mb_pa_free(pa);
5129 : } else {
5130 746 : spin_lock(pa->pa_node_lock.lg_lock);
5131 746 : list_del_rcu(&pa->pa_node.lg_list);
5132 746 : spin_unlock(pa->pa_node_lock.lg_lock);
5133 746 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5134 : }
5135 : }
5136 :
5137 715259 : static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
5138 : {
5139 715259 : struct rb_node **iter = &root->rb_node, *parent = NULL;
5140 715259 : struct ext4_prealloc_space *iter_pa, *new_pa;
5141 715259 : ext4_lblk_t iter_start, new_start;
5142 :
5143 3270021 : while (*iter) {
5144 2554762 : iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
5145 : pa_node.inode_node);
5146 2554762 : new_pa = rb_entry(new, struct ext4_prealloc_space,
5147 : pa_node.inode_node);
5148 2554762 : iter_start = iter_pa->pa_lstart;
5149 2554762 : new_start = new_pa->pa_lstart;
5150 :
5151 2554762 : parent = *iter;
5152 2554762 : if (new_start < iter_start)
5153 697719 : iter = &((*iter)->rb_left);
5154 : else
5155 1857043 : iter = &((*iter)->rb_right);
5156 : }
5157 :
5158 715259 : rb_link_node(new, parent, iter);
5159 715259 : rb_insert_color(new, root);
5160 715251 : }
5161 :
5162 : /*
5163 : * creates new preallocated space for given inode
5164 : */
5165 : static noinline_for_stack void
5166 715263 : ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
5167 : {
5168 715263 : struct super_block *sb = ac->ac_sb;
5169 715263 : struct ext4_sb_info *sbi = EXT4_SB(sb);
5170 715263 : struct ext4_prealloc_space *pa;
5171 715263 : struct ext4_group_info *grp;
5172 715263 : struct ext4_inode_info *ei;
5173 :
5174             : /* preallocate only when found space is larger than requested */
5175 715263 : BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
5176 715263 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
5177 715263 : BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
5178 715263 : BUG_ON(ac->ac_pa == NULL);
5179 :
5180 715263 : pa = ac->ac_pa;
5181 :
5182 715263 : if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
5183 244401 : int new_bex_start;
5184 244401 : int new_bex_end;
5185 :
5186             : /* we can't allocate as much as the normalizer wants,
5187             : * so the found space must get a proper lstart
5188             : * to cover the original request */
5189 244401 : BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
5190 244401 : BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
5191 :
5192 : /*
5193 : * Use the below logic for adjusting best extent as it keeps
5194 : * fragmentation in check while ensuring logical range of best
5195 : * extent doesn't overflow out of goal extent:
5196 : *
5197 : * 1. Check if best ex can be kept at end of goal (before
5198 : * cr_best_avail trimmed it) and still cover original start
5199 : * 2. Else, check if best ex can be kept at start of goal and
5200 : * still cover original start
5201 : * 3. Else, keep the best ex at start of original request.
5202 : */
5203 488802 : new_bex_end = ac->ac_g_ex.fe_logical +
5204 244401 : EXT4_C2B(sbi, ac->ac_orig_goal_len);
5205 244401 : new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5206 244401 : if (ac->ac_o_ex.fe_logical >= new_bex_start)
5207 22419 : goto adjust_bex;
5208 :
5209 221982 : new_bex_start = ac->ac_g_ex.fe_logical;
5210 443964 : new_bex_end =
5211 221982 : new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5212 221982 : if (ac->ac_o_ex.fe_logical < new_bex_end)
5213 83619 : goto adjust_bex;
5214 :
5215 138363 : new_bex_start = ac->ac_o_ex.fe_logical;
5216 276726 : new_bex_end =
5217 138363 : new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5218 :
5219 244401 : adjust_bex:
5220 244401 : ac->ac_b_ex.fe_logical = new_bex_start;
5221 :
5222 244401 : BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
5223 244401 : BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
5224 244401 : BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
5225 : EXT4_C2B(sbi, ac->ac_orig_goal_len)));
5226 : }
5227 :
5228 715263 : pa->pa_lstart = ac->ac_b_ex.fe_logical;
5229 715263 : pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5230 715254 : pa->pa_len = ac->ac_b_ex.fe_len;
5231 715254 : pa->pa_free = pa->pa_len;
5232 715254 : spin_lock_init(&pa->pa_lock);
5233 715254 : INIT_LIST_HEAD(&pa->pa_group_list);
5234 715254 : pa->pa_deleted = 0;
5235 715254 : pa->pa_type = MB_INODE_PA;
5236 :
5237 715254 : mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
5238 : pa->pa_len, pa->pa_lstart);
5239 715254 : trace_ext4_mb_new_inode_pa(ac, pa);
5240 :
5241 715245 : atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
5242 715255 : ext4_mb_use_inode_pa(ac, pa);
5243 :
5244 715250 : ei = EXT4_I(ac->ac_inode);
5245 715250 : grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
5246 715254 : if (!grp)
5247 : return;
5248 :
5249 715254 : pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
5250 715254 : pa->pa_inode = ac->ac_inode;
5251 :
5252 715254 : list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
5253 :
5254 715254 : write_lock(pa->pa_node_lock.inode_lock);
5255 715260 : ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
5256 715250 : write_unlock(pa->pa_node_lock.inode_lock);
5257 715246 : atomic_inc(&ei->i_prealloc_active);
5258 : }
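
A stand-alone sketch of the three-case placement used above when the found extent is shorter than the normalized goal: keep the best extent flush with the end of the goal range if it still covers the original start, else flush with the start of the goal range, else start it at the original request. All block numbers are hypothetical and EXT4_C2B() is treated as identity:

#include <stdio.h>

static long place_best(long g_start, long g_len, long o_start, long b_len)
{
	long start, end;

	/* case 1: keep best at the end of the goal range */
	end = g_start + g_len;
	start = end - b_len;
	if (o_start >= start)
		return start;

	/* case 2: keep best at the start of the goal range */
	start = g_start;
	end = start + b_len;
	if (o_start < end)
		return start;

	/* case 3: start the best extent at the original request */
	return o_start;
}

int main(void)
{
	/* goal [100, 164), original request at 150, best extent of 16 blocks */
	printf("case 1: best starts at %ld\n", place_best(100, 64, 150, 16));
	/* goal [100, 164), original request at 104, best extent of 16 blocks */
	printf("case 2: best starts at %ld\n", place_best(100, 64, 104, 16));
	/* goal [100, 164), original request at 130, best extent of 16 blocks */
	printf("case 3: best starts at %ld\n", place_best(100, 64, 130, 16));
	return 0;
}
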
5259 :
5260 : /*
5261             : * creates new preallocated space for the locality group this inode belongs to
5262 : */
5263 : static noinline_for_stack void
5264 1695 : ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5265 : {
5266 1695 : struct super_block *sb = ac->ac_sb;
5267 1695 : struct ext4_locality_group *lg;
5268 1695 : struct ext4_prealloc_space *pa;
5269 1695 : struct ext4_group_info *grp;
5270 :
5271 : /* preallocate only when found space is larger then requested */
5272             : /* preallocate only when found space is larger than requested */
5273 1695 : BUG_ON(ac->ac_status != AC_STATUS_FOUND);
5274 1695 : BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
5275 1695 : BUG_ON(ac->ac_pa == NULL);
5276 :
5277 1695 : pa = ac->ac_pa;
5278 :
5279 1695 : pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5280 1695 : pa->pa_lstart = pa->pa_pstart;
5281 1695 : pa->pa_len = ac->ac_b_ex.fe_len;
5282 1695 : pa->pa_free = pa->pa_len;
5283 1695 : spin_lock_init(&pa->pa_lock);
5284 1695 : INIT_LIST_HEAD(&pa->pa_node.lg_list);
5285 1695 : INIT_LIST_HEAD(&pa->pa_group_list);
5286 1695 : pa->pa_deleted = 0;
5287 1695 : pa->pa_type = MB_GROUP_PA;
5288 :
5289 1695 : mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
5290 : pa->pa_len, pa->pa_lstart);
5291 1695 : trace_ext4_mb_new_group_pa(ac, pa);
5292 :
5293 1695 : ext4_mb_use_group_pa(ac, pa);
5294 1695 : atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
5295 :
5296 1695 : grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
5297 1695 : if (!grp)
5298 : return;
5299 1695 : lg = ac->ac_lg;
5300 1695 : BUG_ON(lg == NULL);
5301 :
5302 1695 : pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
5303 1695 : pa->pa_inode = NULL;
5304 :
5305 1695 : list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
5306 :
5307 : /*
5308 : * We will later add the new pa to the right bucket
5309 : * after updating the pa_free in ext4_mb_release_context
5310 : */
5311 : }
5312 :
5313 716957 : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
5314 : {
5315 716957 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
5316 1695 : ext4_mb_new_group_pa(ac);
5317 : else
5318 715262 : ext4_mb_new_inode_pa(ac);
5319 716948 : }
5320 :
5321 : /*
5322 : * finds all unused blocks in on-disk bitmap, frees them in
5323 : * in-core bitmap and buddy.
5324 : * @pa must be unlinked from inode and group lists, so that
5325 : * nobody else can find/use it.
5326 : * the caller MUST hold group/inode locks.
5327 : * TODO: optimize the case when there are no in-core structures yet
5328 : */
5329 : static noinline_for_stack int
5330 703823 : ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
5331 : struct ext4_prealloc_space *pa)
5332 : {
5333 703823 : struct super_block *sb = e4b->bd_sb;
5334 703823 : struct ext4_sb_info *sbi = EXT4_SB(sb);
5335 703823 : unsigned int end;
5336 703823 : unsigned int next;
5337 703823 : ext4_group_t group;
5338 703823 : ext4_grpblk_t bit;
5339 703823 : unsigned long long grp_blk_start;
5340 703823 : int free = 0;
5341 :
5342 703823 : BUG_ON(pa->pa_deleted == 0);
5343 703823 : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
5344 703824 : grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
5345 703824 : BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
5346 703824 : end = bit + pa->pa_len;
5347 :
5348 1860041 : while (bit < end) {
5349 1240333 : bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
5350 1240318 : if (bit >= end)
5351 : break;
5352 1156180 : next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
5353 1156180 : mb_debug(sb, "free preallocated %u/%u in group %u\n",
5354 : (unsigned) ext4_group_first_block_no(sb, group) + bit,
5355 : (unsigned) next - bit, (unsigned) group);
5356 1156180 : free += next - bit;
5357 :
5358 1156180 : trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
5359 2312330 : trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
5360 1156165 : EXT4_C2B(sbi, bit)),
5361 : next - bit);
5362 1156163 : mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
5363 1156217 : bit = next + 1;
5364 : }
5365 703846 : if (free != pa->pa_free) {
5366 0 : ext4_msg(e4b->bd_sb, KERN_CRIT,
5367 : "pa %p: logic %lu, phys. %lu, len %d",
5368 : pa, (unsigned long) pa->pa_lstart,
5369 : (unsigned long) pa->pa_pstart,
5370 : pa->pa_len);
5371 0 : ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
5372 : free, pa->pa_free);
5373 : /*
5374 : * pa is already deleted so we use the value obtained
5375 : * from the bitmap and continue.
5376 : */
5377 : }
5378 703846 : atomic_add(free, &sbi->s_mb_discarded);
5379 :
5380 703850 : return 0;
5381 : }
5382 :
5383 : static noinline_for_stack int
5384 176 : ext4_mb_release_group_pa(struct ext4_buddy *e4b,
5385 : struct ext4_prealloc_space *pa)
5386 : {
5387 176 : struct super_block *sb = e4b->bd_sb;
5388 176 : ext4_group_t group;
5389 176 : ext4_grpblk_t bit;
5390 :
5391 176 : trace_ext4_mb_release_group_pa(sb, pa);
5392 176 : BUG_ON(pa->pa_deleted == 0);
5393 176 : ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
5394 176 : if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
5395 0 : ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
5396 : e4b->bd_group, group, pa->pa_pstart);
5397 0 : return 0;
5398 : }
5399 176 : mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
5400 176 : atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
5401 176 : trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
5402 :
5403 176 : return 0;
5404 : }
5405 :
5406 : /*
5407 : * releases all preallocations in given group
5408 : *
5409 : * first, we need to decide discard policy:
5410 : * - when do we discard
5411 : * 1) ENOSPC
5412 : * - how many do we discard
5413 : * 1) how many requested
5414 : */
5415 : static noinline_for_stack int
5416 670720 : ext4_mb_discard_group_preallocations(struct super_block *sb,
5417 : ext4_group_t group, int *busy)
5418 : {
5419 670720 : struct ext4_group_info *grp = ext4_get_group_info(sb, group);
5420 669912 : struct buffer_head *bitmap_bh = NULL;
5421 669912 : struct ext4_prealloc_space *pa, *tmp;
5422 669912 : struct list_head list;
5423 669912 : struct ext4_buddy e4b;
5424 669912 : struct ext4_inode_info *ei;
5425 669912 : int err;
5426 669912 : int free = 0;
5427 :
5428 669912 : if (!grp)
5429 : return 0;
5430 669912 : mb_debug(sb, "discard preallocation for group %u\n", group);
5431 669912 : if (list_empty(&grp->bb_prealloc_list))
5432 648004 : goto out_dbg;
5433 :
5434 21908 : bitmap_bh = ext4_read_block_bitmap(sb, group);
5435 21905 : if (IS_ERR(bitmap_bh)) {
5436 0 : err = PTR_ERR(bitmap_bh);
5437 0 : ext4_error_err(sb, -err,
5438 : "Error %d reading block bitmap for %u",
5439 : err, group);
5440 0 : goto out_dbg;
5441 : }
5442 :
5443 21905 : err = ext4_mb_load_buddy(sb, group, &e4b);
5444 21907 : if (err) {
5445 0 : ext4_warning(sb, "Error %d loading buddy information for %u",
5446 : err, group);
5447 0 : put_bh(bitmap_bh);
5448 0 : goto out_dbg;
5449 : }
5450 :
5451 21907 : INIT_LIST_HEAD(&list);
5452 21907 : ext4_lock_group(sb, group);
5453 257607 : list_for_each_entry_safe(pa, tmp,
5454 : &grp->bb_prealloc_list, pa_group_list) {
5455 235681 : spin_lock(&pa->pa_lock);
5456 235680 : if (atomic_read(&pa->pa_count)) {
5457 5263 : spin_unlock(&pa->pa_lock);
5458 5264 : *busy = 1;
5459 5264 : continue;
5460 : }
5461 230417 : if (pa->pa_deleted) {
5462 0 : spin_unlock(&pa->pa_lock);
5463 0 : continue;
5464 : }
5465 :
5466 : /* seems this one can be freed ... */
5467 230417 : ext4_mb_mark_pa_deleted(sb, pa);
5468 :
5469 230416 : if (!free)
5470 21214 : this_cpu_inc(discard_pa_seq);
5471 :
5472 : /* we can trust pa_free ... */
5473 230416 : free += pa->pa_free;
5474 :
5475 230416 : spin_unlock(&pa->pa_lock);
5476 :
5477 230417 : list_del(&pa->pa_group_list);
5478 230416 : list_add(&pa->u.pa_tmp_list, &list);
5479 : }
5480 :
5481 : /* now free all selected PAs */
5482 252331 : list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
5483 :
5484 : /* remove from object (inode or locality group) */
5485 230403 : if (pa->pa_type == MB_GROUP_PA) {
5486 28 : spin_lock(pa->pa_node_lock.lg_lock);
5487 28 : list_del_rcu(&pa->pa_node.lg_list);
5488 28 : spin_unlock(pa->pa_node_lock.lg_lock);
5489 : } else {
5490 230375 : write_lock(pa->pa_node_lock.inode_lock);
5491 230382 : ei = EXT4_I(pa->pa_inode);
5492 230382 : rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
5493 230374 : write_unlock(pa->pa_node_lock.inode_lock);
5494 : }
5495 :
5496 230401 : list_del(&pa->u.pa_tmp_list);
5497 :
5498 230400 : if (pa->pa_type == MB_GROUP_PA) {
5499 28 : ext4_mb_release_group_pa(&e4b, pa);
5500 28 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5501 : } else {
5502 230372 : ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
5503 230382 : ext4_mb_pa_free(pa);
5504 : }
5505 : }
5506 :
5507 21928 : ext4_unlock_group(sb, group);
5508 21928 : ext4_mb_unload_buddy(&e4b);
5509 21926 : put_bh(bitmap_bh);
5510 : out_dbg:
5511 : mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
5512 : free, group, grp->bb_free);
5513 : return free;
5514 : }
5515 :
5516 : /*
5517             : * releases all unused preallocated blocks for the given inode
5518             : *
5519             : * It's important to discard preallocations under i_data_sem.
5520             : * We don't want another block to be served from the prealloc
5521             : * space while we are discarding the inode prealloc space.
5522 : *
5523 : * FIXME!! Make sure it is valid at all the call sites
5524 : */
5525 9743441 : void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
5526 : {
5527 9743441 : struct ext4_inode_info *ei = EXT4_I(inode);
5528 9743441 : struct super_block *sb = inode->i_sb;
5529 9743441 : struct buffer_head *bitmap_bh = NULL;
5530 9743441 : struct ext4_prealloc_space *pa, *tmp;
5531 9743441 : ext4_group_t group = 0;
5532 9743441 : struct list_head list;
5533 9743441 : struct ext4_buddy e4b;
5534 9743441 : struct rb_node *iter;
5535 9743441 : int err;
5536 :
5537 9743441 : if (!S_ISREG(inode->i_mode)) {
5538 986938 : return;
5539 : }
5540 :
5541 8756599 : if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
5542 : return;
5543 :
5544 8756503 : mb_debug(sb, "discard preallocation for inode %lu\n",
5545 : inode->i_ino);
5546 8756503 : trace_ext4_discard_preallocations(inode,
5547 : atomic_read(&ei->i_prealloc_active), needed);
5548 :
5549 8754345 : INIT_LIST_HEAD(&list);
5550 :
5551 8754345 : if (needed == 0)
5552 8754345 : needed = UINT_MAX;
5553 :
5554 0 : repeat:
5555 : /* first, collect all pa's in the inode */
5556 8754345 : write_lock(&ei->i_prealloc_lock);
5557 9230370 : for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
5558 473444 : iter = rb_next(iter)) {
5559 473452 : pa = rb_entry(iter, struct ext4_prealloc_space,
5560 : pa_node.inode_node);
5561 473452 : BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
5562 :
5563 473452 : spin_lock(&pa->pa_lock);
5564 473456 : if (atomic_read(&pa->pa_count)) {
5565 : /* this shouldn't happen often - nobody should
5566 : * use preallocation while we're discarding it */
5567 0 : spin_unlock(&pa->pa_lock);
5568 0 : write_unlock(&ei->i_prealloc_lock);
5569 0 : ext4_msg(sb, KERN_ERR,
5570 : "uh-oh! used pa while discarding");
5571 0 : WARN_ON(1);
5572 0 : schedule_timeout_uninterruptible(HZ);
5573 0 : goto repeat;
5574 :
5575 : }
5576 473456 : if (pa->pa_deleted == 0) {
5577 473456 : ext4_mb_mark_pa_deleted(sb, pa);
5578 473471 : spin_unlock(&pa->pa_lock);
5579 473466 : rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
5580 473457 : list_add(&pa->u.pa_tmp_list, &list);
5581 473444 : needed--;
5582 473444 : continue;
5583 : }
5584 :
5585 : /* someone is deleting pa right now */
5586 0 : spin_unlock(&pa->pa_lock);
5587 0 : write_unlock(&ei->i_prealloc_lock);
5588 :
5589             : /* we have to wait here because pa_deleted
5590             : * doesn't mean the pa is already unlinked from
5591             : * the list. since we might be called from
5592             : * ->clear_inode(), the inode will get freed
5593             : * and a concurrent thread which is unlinking the
5594             : * pa from the inode's list may access already
5595             : * freed memory - bad, bad, bad */
5596 :
5597 : /* XXX: if this happens too often, we can
5598 : * add a flag to force wait only in case
5599 : * of ->clear_inode(), but not in case of
5600 : * regular truncate */
5601 0 : schedule_timeout_uninterruptible(HZ);
5602 0 : goto repeat;
5603 : }
5604 8756403 : write_unlock(&ei->i_prealloc_lock);
5605 :
5606 9229011 : list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
5607 473409 : BUG_ON(pa->pa_type != MB_INODE_PA);
5608 473409 : group = ext4_get_group_number(sb, pa->pa_pstart);
5609 :
5610 473374 : err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5611 : GFP_NOFS|__GFP_NOFAIL);
5612 473431 : if (err) {
5613 0 : ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5614 : err, group);
5615 0 : continue;
5616 : }
5617 :
5618 473431 : bitmap_bh = ext4_read_block_bitmap(sb, group);
5619 473414 : if (IS_ERR(bitmap_bh)) {
5620 0 : err = PTR_ERR(bitmap_bh);
5621 0 : ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
5622 : err, group);
5623 0 : ext4_mb_unload_buddy(&e4b);
5624 0 : continue;
5625 : }
5626 :
5627 473414 : ext4_lock_group(sb, group);
5628 473461 : list_del(&pa->pa_group_list);
5629 473459 : ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
5630 473468 : ext4_unlock_group(sb, group);
5631 :
5632 473481 : ext4_mb_unload_buddy(&e4b);
5633 473472 : put_bh(bitmap_bh);
5634 :
5635 473481 : list_del(&pa->u.pa_tmp_list);
5636 473477 : ext4_mb_pa_free(pa);
5637 : }
5638 : }
5639 :
5640 2679247 : static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
5641 : {
5642 2679247 : struct ext4_prealloc_space *pa;
5643 :
5644 2679247 : BUG_ON(ext4_pspace_cachep == NULL);
5645 2679247 : pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
5646 2679572 : if (!pa)
5647 : return -ENOMEM;
5648 2679572 : atomic_set(&pa->pa_count, 1);
5649 2679572 : ac->ac_pa = pa;
5650 2679572 : return 0;
5651 : }
5652 :
5653 1962887 : static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
5654 : {
5655 1962887 : struct ext4_prealloc_space *pa = ac->ac_pa;
5656 :
5657 1962887 : BUG_ON(!pa);
5658 1962887 : ac->ac_pa = NULL;
5659 1962887 : WARN_ON(!atomic_dec_and_test(&pa->pa_count));
5660 : /*
5661             : * the current function is only called due to an error or because the
5662             : * len of found blocks < len of requested blocks, hence the PA has not
5663             : * been added to grp->bb_prealloc_list, so we don't need to lock it
5664 : */
5665 1962991 : pa->pa_deleted = 1;
5666 1962991 : ext4_mb_pa_free(pa);
5667 1962904 : }
5668 :
5669 : #ifdef CONFIG_EXT4_DEBUG
5670 432227 : static inline void ext4_mb_show_pa(struct super_block *sb)
5671 : {
5672 432227 : ext4_group_t i, ngroups;
5673 :
5674 432227 : if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5675 : return;
5676 :
5677 432227 : ngroups = ext4_get_groups_count(sb);
5678 432227 : mb_debug(sb, "groups: ");
5679 48263224 : for (i = 0; i < ngroups; i++) {
5680 47398671 : struct ext4_group_info *grp = ext4_get_group_info(sb, i);
5681 46030990 : struct ext4_prealloc_space *pa;
5682 46030990 : ext4_grpblk_t start;
5683 46030990 : struct list_head *cur;
5684 :
5685 46030990 : if (!grp)
5686 0 : continue;
5687 46030990 : ext4_lock_group(sb, i);
5688 104038480 : list_for_each(cur, &grp->bb_prealloc_list) {
5689 58462198 : pa = list_entry(cur, struct ext4_prealloc_space,
5690 : pa_group_list);
5691 58462198 : spin_lock(&pa->pa_lock);
5692 58498412 : ext4_get_group_no_and_offset(sb, pa->pa_pstart,
5693 : NULL, &start);
5694 57574834 : spin_unlock(&pa->pa_lock);
5695 58745072 : mb_debug(sb, "PA:%u:%d:%d\n", i, start,
5696 : pa->pa_len);
5697 : }
5698 45576282 : ext4_unlock_group(sb, i);
5699 47398781 : mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
5700 : grp->bb_fragments);
5701 : }
5702 : }
5703 :
5704 234 : static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5705 : {
5706 234 : struct super_block *sb = ac->ac_sb;
5707 :
5708 234 : if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5709 : return;
5710 :
5711 234 : mb_debug(sb, "Can't allocate:"
5712 : " Allocation context details:");
5713 234 : mb_debug(sb, "status %u flags 0x%x",
5714 : ac->ac_status, ac->ac_flags);
5715 234 : mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
5716 : "goal %lu/%lu/%lu@%lu, "
5717 : "best %lu/%lu/%lu@%lu cr %d",
5718 : (unsigned long)ac->ac_o_ex.fe_group,
5719 : (unsigned long)ac->ac_o_ex.fe_start,
5720 : (unsigned long)ac->ac_o_ex.fe_len,
5721 : (unsigned long)ac->ac_o_ex.fe_logical,
5722 : (unsigned long)ac->ac_g_ex.fe_group,
5723 : (unsigned long)ac->ac_g_ex.fe_start,
5724 : (unsigned long)ac->ac_g_ex.fe_len,
5725 : (unsigned long)ac->ac_g_ex.fe_logical,
5726 : (unsigned long)ac->ac_b_ex.fe_group,
5727 : (unsigned long)ac->ac_b_ex.fe_start,
5728 : (unsigned long)ac->ac_b_ex.fe_len,
5729 : (unsigned long)ac->ac_b_ex.fe_logical,
5730 : (int)ac->ac_criteria);
5731 234 : mb_debug(sb, "%u found", ac->ac_found);
5732 234 : mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
5733 234 : if (ac->ac_pa)
5734 : mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
5735 : "group pa" : "inode pa");
5736 234 : ext4_mb_show_pa(sb);
5737 : }
5738 : #else
5739 : static inline void ext4_mb_show_pa(struct super_block *sb)
5740 : {
5741 : return;
5742 : }
5743 : static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5744 : {
5745 : ext4_mb_show_pa(ac->ac_sb);
5746 : return;
5747 : }
5748 : #endif
5749 :
5750 : /*
5751             : * We use locality group preallocation for small files. The size of the
5752             : * file is the current size or the resulting size after
5753             : * allocation, whichever is larger.
5754 : *
5755 : * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
5756 : */
5757 3617994 : static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
5758 : {
5759 3617994 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
5760 3617994 : int bsbits = ac->ac_sb->s_blocksize_bits;
5761 3617994 : loff_t size, isize;
5762 3617994 : bool inode_pa_eligible, group_pa_eligible;
5763 :
5764 3617994 : if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5765 : return;
5766 :
5767 3086768 : if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
5768 : return;
5769 :
5770 3086768 : group_pa_eligible = sbi->s_mb_group_prealloc > 0;
5771 3086768 : inode_pa_eligible = true;
5772 3086768 : size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
5773 3086768 : isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
5774 0 : >> bsbits;
5775 :
5776 : /* No point in using inode preallocation for closed files */
5777 3086768 : if ((size == isize) && !ext4_fs_is_busy(sbi) &&
5778 449975 : !inode_is_open_for_write(ac->ac_inode))
5779 326054 : inode_pa_eligible = false;
5780 :
5781 3086768 : size = max(size, isize);
5782 : /* Don't use group allocation for large files */
5783 3086768 : if (size > sbi->s_mb_stream_request)
5784 : group_pa_eligible = false;
5785 :
5786 295858 : if (!group_pa_eligible) {
5787 2790910 : if (inode_pa_eligible)
5788 2707881 : ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
5789 : else
5790 83029 : ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
5791 2790910 : return;
5792 : }
5793 :
5794 295858 : BUG_ON(ac->ac_lg != NULL);
5795 : /*
5796             : * locality group prealloc space is per-CPU. The reason for having
5797             : * a per-CPU locality group is to reduce contention between block
5798             : * requests from multiple CPUs.
5799 : */
5800 295858 : ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
5801 :
5802 : /* we're going to use group allocation */
5803 295858 : ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
5804 :
5805 : /* serialize all allocations in the group */
5806 295858 : mutex_lock(&ac->ac_lg->lg_mutex);
5807 : }
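
A stand-alone sketch of the policy decided in ext4_mb_group_or_file() above; the helper name, enum values and the 16-block threshold are placeholders for illustration (and the fs-busy check is omitted), not kernel definitions:

#include <stdbool.h>
#include <stdio.h>

enum policy { POLICY_GROUP_PA, POLICY_INODE_PA, POLICY_NO_PA };

/* req_end/isize in blocks; stream_req plays the role of s_mb_stream_request */
static enum policy choose_policy(long long req_end, long long isize,
				 long long stream_req, bool open_for_write)
{
	long long size = req_end > isize ? req_end : isize;

	if (size <= stream_req)
		return POLICY_GROUP_PA;  /* small file: locality group PA */
	if (!open_for_write && req_end == isize)
		return POLICY_NO_PA;     /* closed, idle file: no preallocation */
	return POLICY_INODE_PA;          /* large, active file: inode PA */
}

int main(void)
{
	static const char *names[] = { "group PA", "inode PA", "no PA" };

	printf("%s\n", names[choose_policy(8, 8, 16, true)]);        /* group PA */
	printf("%s\n", names[choose_policy(4096, 4096, 16, false)]); /* no PA    */
	printf("%s\n", names[choose_policy(4096, 1024, 16, true)]);  /* inode PA */
	return 0;
}
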
5808 :
5809 : static noinline_for_stack void
5810 3619011 : ext4_mb_initialize_context(struct ext4_allocation_context *ac,
5811 : struct ext4_allocation_request *ar)
5812 : {
5813 3619011 : struct super_block *sb = ar->inode->i_sb;
5814 3619011 : struct ext4_sb_info *sbi = EXT4_SB(sb);
5815 3619011 : struct ext4_super_block *es = sbi->s_es;
5816 3619011 : ext4_group_t group;
5817 3619011 : unsigned int len;
5818 3619011 : ext4_fsblk_t goal;
5819 3619011 : ext4_grpblk_t block;
5820 :
5821 : /* we can't allocate > group size */
5822 3619011 : len = ar->len;
5823 :
5824 : /* just a dirty hack to filter too big requests */
5825 3619011 : if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
5826 1 : len = EXT4_CLUSTERS_PER_GROUP(sb);
5827 :
5828 : /* start searching from the goal */
5829 3619011 : goal = ar->goal;
5830 7237951 : if (goal < le32_to_cpu(es->s_first_data_block) ||
5831 : goal >= ext4_blocks_count(es))
5832 : goal = le32_to_cpu(es->s_first_data_block);
5833 3619011 : ext4_get_group_no_and_offset(sb, goal, &group, &block);
5834 :
5835 : /* set up allocation goals */
5836 3617690 : ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
5837 3617690 : ac->ac_status = AC_STATUS_CONTINUE;
5838 3617690 : ac->ac_sb = sb;
5839 3617690 : ac->ac_inode = ar->inode;
5840 3617690 : ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
5841 3617690 : ac->ac_o_ex.fe_group = group;
5842 3617690 : ac->ac_o_ex.fe_start = block;
5843 3617690 : ac->ac_o_ex.fe_len = len;
5844 3617690 : ac->ac_g_ex = ac->ac_o_ex;
5845 3617690 : ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
5846 3617690 : ac->ac_flags = ar->flags;
5847 :
5848 : /* we have to define context: we'll work with a file or
5849 : * locality group. this is a policy, actually */
5850 3617690 : ext4_mb_group_or_file(ac);
5851 :
5852 3617949 : mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
5853 : "left: %u/%u, right %u/%u to %swritable\n",
5854 : (unsigned) ar->len, (unsigned) ar->logical,
5855 : (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
5856 : (unsigned) ar->lleft, (unsigned) ar->pleft,
5857 : (unsigned) ar->lright, (unsigned) ar->pright,
5858 : inode_is_open_for_write(ar->inode) ? "" : "non-");
5859 3617949 : }
5860 :
5861 : static noinline_for_stack void
5862 37 : ext4_mb_discard_lg_preallocations(struct super_block *sb,
5863 : struct ext4_locality_group *lg,
5864 : int order, int total_entries)
5865 : {
5866 37 : ext4_group_t group = 0;
5867 37 : struct ext4_buddy e4b;
5868 37 : struct list_head discard_list;
5869 37 : struct ext4_prealloc_space *pa, *tmp;
5870 :
5871 37 : mb_debug(sb, "discard locality group preallocation\n");
5872 :
5873 37 : INIT_LIST_HEAD(&discard_list);
5874 :
5875 37 : spin_lock(&lg->lg_prealloc_lock);
5876 148 : list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5877 : pa_node.lg_list,
5878 : lockdep_is_held(&lg->lg_prealloc_lock)) {
5879 148 : spin_lock(&pa->pa_lock);
5880 148 : if (atomic_read(&pa->pa_count)) {
5881 : /*
5882 : * This is the pa that we just used
5883 : * for block allocation. So don't
5884 : * free that
5885 : */
5886 0 : spin_unlock(&pa->pa_lock);
5887 0 : continue;
5888 : }
5889 148 : if (pa->pa_deleted) {
5890 0 : spin_unlock(&pa->pa_lock);
5891 0 : continue;
5892 : }
5893 : /* only lg prealloc space */
5894 148 : BUG_ON(pa->pa_type != MB_GROUP_PA);
5895 :
5896 : /* seems this one can be freed ... */
5897 148 : ext4_mb_mark_pa_deleted(sb, pa);
5898 148 : spin_unlock(&pa->pa_lock);
5899 :
5900 148 : list_del_rcu(&pa->pa_node.lg_list);
5901 148 : list_add(&pa->u.pa_tmp_list, &discard_list);
5902 :
5903 148 : total_entries--;
5904 148 : if (total_entries <= 5) {
5905 : /*
5906             : * we want to keep only 5 entries,
5907             : * allowing the list to grow to 8. This
5908             : * makes sure we don't call discard
5909             : * again soon for this list.
5910 : */
5911 : break;
5912 : }
5913 : }
5914 37 : spin_unlock(&lg->lg_prealloc_lock);
5915 :
5916 185 : list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5917 148 : int err;
5918 :
5919 148 : group = ext4_get_group_number(sb, pa->pa_pstart);
5920 148 : err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5921 : GFP_NOFS|__GFP_NOFAIL);
5922 148 : if (err) {
5923 0 : ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5924 : err, group);
5925 0 : continue;
5926 : }
5927 148 : ext4_lock_group(sb, group);
5928 148 : list_del(&pa->pa_group_list);
5929 148 : ext4_mb_release_group_pa(&e4b, pa);
5930 148 : ext4_unlock_group(sb, group);
5931 :
5932 148 : ext4_mb_unload_buddy(&e4b);
5933 148 : list_del(&pa->u.pa_tmp_list);
5934 148 : call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5935 : }
5936 37 : }
5937 :
5938 : /*
5939 : * We have incremented pa_count. So it cannot be freed at this
5940 : * point. Also we hold lg_mutex. So no parallel allocation is
5941 : * possible from this lg. That means pa_free cannot be updated.
5942 : *
5943             : * A parallel ext4_mb_discard_group_preallocations() is possible,
5944             : * which can cause the lg_prealloc_list to be updated.
5945 : */
5946 :
5947 265755 : static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5948 : {
5949 265755 : int order, added = 0, lg_prealloc_count = 1;
5950 265755 : struct super_block *sb = ac->ac_sb;
5951 265755 : struct ext4_locality_group *lg = ac->ac_lg;
5952 265755 : struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5953 :
5954 265755 : order = fls(pa->pa_free) - 1;
5955 265755 : if (order > PREALLOC_TB_SIZE - 1)
5956 : /* The max size of hash table is PREALLOC_TB_SIZE */
5957 : order = PREALLOC_TB_SIZE - 1;
5958 : /* Add the prealloc space to lg */
5959 265755 : spin_lock(&lg->lg_prealloc_lock);
5960 266904 : list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5961 : pa_node.lg_list,
5962 : lockdep_is_held(&lg->lg_prealloc_lock)) {
5963 1149 : spin_lock(&tmp_pa->pa_lock);
5964 1149 : if (tmp_pa->pa_deleted) {
5965 0 : spin_unlock(&tmp_pa->pa_lock);
5966 0 : continue;
5967 : }
5968 1149 : if (!added && pa->pa_free < tmp_pa->pa_free) {
5969 : /* Add to the tail of the previous entry */
5970 12 : list_add_tail_rcu(&pa->pa_node.lg_list,
5971 : &tmp_pa->pa_node.lg_list);
5972 12 : added = 1;
5973 : /*
5974 : * we want to count the total
5975 : * number of entries in the list
5976 : */
5977 : }
5978 1149 : spin_unlock(&tmp_pa->pa_lock);
5979 1149 : lg_prealloc_count++;
5980 : }
5981 265755 : if (!added)
5982 265743 : list_add_tail_rcu(&pa->pa_node.lg_list,
5983 265743 : &lg->lg_prealloc_list[order]);
5984 265755 : spin_unlock(&lg->lg_prealloc_lock);
5985 :
5986 : /* Now trim the list to be not more than 8 elements */
5987 265755 : if (lg_prealloc_count > 8) {
5988 37 : ext4_mb_discard_lg_preallocations(sb, lg,
5989 : order, lg_prealloc_count);
5990 37 : return;
5991 : }
5992 : return ;
5993 : }
5994 :
5995 : /*
5996 : * release all resource we used in allocation
5997 : */
5998 3620230 : static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5999 : {
6000 3620230 : struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
6001 3620230 : struct ext4_prealloc_space *pa = ac->ac_pa;
6002 3620230 : if (pa) {
6003 1657292 : if (pa->pa_type == MB_GROUP_PA) {
6004 : /* see comment in ext4_mb_use_group_pa() */
6005 266501 : spin_lock(&pa->pa_lock);
6006 266501 : pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
6007 266501 : pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
6008 266501 : pa->pa_free -= ac->ac_b_ex.fe_len;
6009 266501 : pa->pa_len -= ac->ac_b_ex.fe_len;
6010 266501 : spin_unlock(&pa->pa_lock);
6011 :
6012 : /*
6013 : * We want to add the pa to the right bucket.
6014 : * Remove it from the list and while adding
6015 : * make sure the list to which we are adding
6016 : * doesn't grow big.
6017 : */
6018 266501 : if (likely(pa->pa_free)) {
6019 265755 : spin_lock(pa->pa_node_lock.lg_lock);
6020 265755 : list_del_rcu(&pa->pa_node.lg_list);
6021 265755 : spin_unlock(pa->pa_node_lock.lg_lock);
6022 265755 : ext4_mb_add_n_trim(ac);
6023 : }
6024 : }
6025 :
6026 1657292 : ext4_mb_put_pa(ac, ac->ac_sb, pa);
6027 : }
6028 3620163 : if (ac->ac_bitmap_page)
6029 2679567 : put_page(ac->ac_bitmap_page);
6030 3620187 : if (ac->ac_buddy_page)
6031 2679596 : put_page(ac->ac_buddy_page);
6032 3620200 : if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
6033 295889 : mutex_unlock(&ac->ac_lg->lg_mutex);
6034 3620200 : ext4_mb_collect_stats(ac);
6035 3619732 : return 0;
6036 : }
6037 :
6038 5941 : static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
6039 : {
6040 5941 : ext4_group_t i, ngroups = ext4_get_groups_count(sb);
6041 5939 : int ret;
6042 5939 : int freed = 0, busy = 0;
6043 5939 : int retry = 0;
6044 :
6045 5939 : trace_ext4_mb_discard_preallocations(sb, needed);
6046 :
6047 5941 : if (needed == 0)
6048 0 : needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
6049 5941 : repeat:
6050 675465 : for (i = 0; i < ngroups && needed > 0; i++) {
6051 669506 : ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
6052 669447 : freed += ret;
6053 669447 : needed -= ret;
6054 669447 : cond_resched();
6055 : }
6056 :
6057 5959 : if (needed > 0 && busy && ++retry < 3) {
6058 1 : busy = 0;
6059 1 : goto repeat;
6060 : }
6061 :
6062 5958 : return freed;
6063 : }
6064 :
6065 5942 : static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
6066 : struct ext4_allocation_context *ac, u64 *seq)
6067 : {
6068 5942 : int freed;
6069 5942 : u64 seq_retry = 0;
6070 5942 : bool ret = false;
6071 :
6072 5942 : freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
6073 5958 : if (freed) {
6074 5512 : ret = true;
6075 5512 : goto out_dbg;
6076 : }
6077 446 : seq_retry = ext4_get_discard_pa_seq_sum();
6078 445 : if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
6079 240 : ac->ac_flags |= EXT4_MB_STRICT_CHECK;
6080 240 : *seq = seq_retry;
6081 240 : ret = true;
6082 : }
6083 :
6084 205 : out_dbg:
6085 5957 : mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
6086 5957 : return ret;
6087 : }
6088 :
6089 : /*
6090 : * Simple allocator for Ext4 fast commit replay path. It searches for blocks
6091 : * linearly starting at the goal block and also excludes the blocks which
6092 : * are going to be in use after fast commit replay.
6093 : */
6094 : static ext4_fsblk_t
6095 0 : ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
6096 : {
6097 0 : struct buffer_head *bitmap_bh;
6098 0 : struct super_block *sb = ar->inode->i_sb;
6099 0 : struct ext4_sb_info *sbi = EXT4_SB(sb);
6100 0 : ext4_group_t group, nr;
6101 0 : ext4_grpblk_t blkoff;
6102 0 : ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
6103 0 : ext4_grpblk_t i = 0;
6104 0 : ext4_fsblk_t goal, block;
6105 0 : struct ext4_super_block *es = EXT4_SB(sb)->s_es;
6106 :
6107 0 : goal = ar->goal;
6108 0 : if (goal < le32_to_cpu(es->s_first_data_block) ||
6109 : goal >= ext4_blocks_count(es))
6110 : goal = le32_to_cpu(es->s_first_data_block);
6111 :
6112 0 : ar->len = 0;
6113 0 : ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
6114 0 : for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
6115 0 : bitmap_bh = ext4_read_block_bitmap(sb, group);
6116 0 : if (IS_ERR(bitmap_bh)) {
6117 0 : *errp = PTR_ERR(bitmap_bh);
6118 0 : pr_warn("Failed to read block bitmap\n");
6119 0 : return 0;
6120 : }
6121 :
6122 0 : while (1) {
6123 0 : i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
6124 : blkoff);
6125 0 : if (i >= max)
6126 : break;
6127 0 : if (ext4_fc_replay_check_excluded(sb,
6128 0 : ext4_group_first_block_no(sb, group) +
6129 0 : EXT4_C2B(sbi, i))) {
6130 0 : blkoff = i + 1;
6131 : } else
6132 : break;
6133 : }
6134 0 : brelse(bitmap_bh);
6135 0 : if (i < max)
6136 : break;
6137 :
6138 0 : if (++group >= ext4_get_groups_count(sb))
6139 0 : group = 0;
6140 :
6141 0 : blkoff = 0;
6142 : }
6143 :
6144 0 : if (i >= max) {
6145 0 : *errp = -ENOSPC;
6146 0 : return 0;
6147 : }
6148 :
6149 0 : block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
6150 0 : ext4_mb_mark_bb(sb, block, 1, 1);
6151 0 : ar->len = 1;
6152 :
6153 0 : return block;
6154 : }
6155 :
6156 : /*
6157             : * Main entry point into mballoc to allocate blocks.
6158             : * It tries to use preallocation first, then falls back
6159             : * to the usual allocation.
6160 : */
6161 4055377 : ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
6162 : struct ext4_allocation_request *ar, int *errp)
6163 : {
6164 4055377 : struct ext4_allocation_context *ac = NULL;
6165 4055377 : struct ext4_sb_info *sbi;
6166 4055377 : struct super_block *sb;
6167 4055377 : ext4_fsblk_t block = 0;
6168 4055377 : unsigned int inquota = 0;
6169 4055377 : unsigned int reserv_clstrs = 0;
6170 4055377 : int retries = 0;
6171 4055377 : u64 seq;
6172 :
6173 4055377 : might_sleep();
6174 4054422 : sb = ar->inode->i_sb;
6175 4054422 : sbi = EXT4_SB(sb);
6176 :
6177 4054422 : trace_ext4_request_blocks(ar);
6178 4054906 : if (sbi->s_mount_state & EXT4_FC_REPLAY)
6179 0 : return ext4_mb_new_blocks_simple(ar, errp);
6180 :
6181 : /* Allow to use superuser reservation for quota file */
6182 4057704 : if (ext4_is_quota_file(ar->inode))
6183 2788 : ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
6184 :
6185 4054906 : if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
6186 : /* Without delayed allocation we need to verify
6187 : * there are enough free blocks to do block allocation
6188 : * and that the allocation doesn't exceed the quota limits.
6189 : */
6190 9081287 : while (ar->len &&
6191 4323891 : ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
6192 :
6193 : /* let others free the space */
6194 1588053 : cond_resched();
6195 1588013 : ar->len = ar->len >> 1;
6196 : }
6197 3169343 : if (!ar->len) {
6198 432044 : ext4_mb_show_pa(sb);
6199 432096 : *errp = -ENOSPC;
6200 432096 : return 0;
6201 : }
6202 2737299 : reserv_clstrs = ar->len;
6203 2737299 : if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
6204 5516 : dquot_alloc_block_nofail(ar->inode,
6205 2758 : EXT4_C2B(sbi, ar->len));
6206 : } else {
6207 5560893 : while (ar->len &&
6208 5555522 : dquot_alloc_block(ar->inode,
6209 2777761 : EXT4_C2B(sbi, ar->len))) {
6210 :
6211 48091 : ar->flags |= EXT4_MB_HINT_NOPREALLOC;
6212 48091 : ar->len--;
6213 : }
6214 : }
6215 2737799 : inquota = ar->len;
6216 2737799 : if (ar->len == 0) {
6217 4871 : *errp = -EDQUOT;
6218 4871 : goto out;
6219 : }
6220 : }
6221 :
6222 3619935 : ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
6223 3619720 : if (!ac) {
6224 0 : ar->len = 0;
6225 0 : *errp = -ENOMEM;
6226 0 : goto out;
6227 : }
6228 :
6229 3619720 : ext4_mb_initialize_context(ac, ar);
6230 :
6231 3618191 : ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
6232 3618191 : seq = this_cpu_read(discard_pa_seq);
6233 3618459 : if (!ext4_mb_use_preallocated(ac)) {
6234 2678988 : ac->ac_op = EXT4_MB_HISTORY_ALLOC;
6235 2678988 : ext4_mb_normalize_request(ac, ar);
6236 :
6237 2679292 : *errp = ext4_mb_pa_alloc(ac);
6238 2679272 : if (*errp)
6239 0 : goto errout;
6240 2679272 : repeat:
6241 : /* allocate space in core */
6242 2685024 : *errp = ext4_mb_regular_allocator(ac);
6243 : /*
6244 : * The pa allocated above is added to grp->bb_prealloc_list only
6245 : * when we were able to allocate some blocks, i.e. when
6246 : * ac->ac_status == AC_STATUS_FOUND.
6247 : * An error from above means ac->ac_status != AC_STATUS_FOUND,
6248 : * so we have to free the pa here ourselves.
6249 : */
6250 2685593 : if (*errp) {
6251 1 : ext4_mb_pa_put_free(ac);
6252 1 : ext4_discard_allocated_blocks(ac);
6253 1 : goto errout;
6254 : }
6255 2685592 : if (ac->ac_status == AC_STATUS_FOUND &&
6256 2679551 : ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
6257 1962661 : ext4_mb_pa_put_free(ac);
6258 : }
6259 3625166 : if (likely(ac->ac_status == AC_STATUS_FOUND)) {
6260 3619196 : *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
6261 3619975 : if (*errp) {
6262 0 : ext4_discard_allocated_blocks(ac);
6263 0 : goto errout;
6264 : } else {
6265 3619975 : block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
6266 3619979 : ar->len = ac->ac_b_ex.fe_len;
6267 : }
6268 : } else {
6269 11926 : if (++retries < 3 &&
6270 5942 : ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
6271 5752 : goto repeat;
6272 : /*
6273 : * If block allocation fails then the pa allocated above
6274 : * must be freed here.
6275 : */
6276 232 : ext4_mb_pa_put_free(ac);
6277 233 : *errp = -ENOSPC;
6278 : }
6279 :
6280 3620212 : if (*errp) {
6281 233 : errout:
6282 234 : ac->ac_b_ex.fe_len = 0;
6283 234 : ar->len = 0;
6284 234 : ext4_mb_show_ac(ac);
6285 : }
6286 3620213 : ext4_mb_release_context(ac);
6287 3619696 : kmem_cache_free(ext4_ac_cachep, ac);
6288 3624729 : out:
6289 3624729 : if (inquota && ar->len < inquota)
6290 680599 : dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
6291 3624734 : if (!ar->len) {
6292 5101 : if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
6293 : /* release all the reserved blocks if non delalloc */
6294 5021 : percpu_counter_sub(&sbi->s_dirtyclusters_counter,
6295 : reserv_clstrs);
6296 : }
6297 :
6298 3624735 : trace_ext4_allocate_blocks(ar, (unsigned long long)block);
6299 :
6300 3624735 : return block;
6301 : }
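One detail of the entry point worth spelling out is the back-off before -ENOSPC is returned: when the free-cluster claim fails, the requested length is halved and retried until it either fits or drops to zero. A minimal sketch of that loop, assuming a toy claim_free_clusters() in place of ext4_claim_free_clusters():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for ext4_claim_free_clusters(): succeed only
 * when the request fits into the clusters that are still free. */
static bool claim_free_clusters(unsigned long *free, unsigned int want)
{
	if (want > *free)
		return false;
	*free -= want;
	return true;
}

/* Halve the request until it can be satisfied, as the allocator does
 * before giving up with -ENOSPC. Returns the granted length (0 = ENOSPC). */
static unsigned int claim_with_backoff(unsigned long *free, unsigned int len)
{
	while (len && !claim_free_clusters(free, len))
		len >>= 1;
	return len;
}

int main(void)
{
	unsigned long free_clusters = 5;

	/* A request for 16 clusters is halved to 8, then to 4, which fits. */
	printf("granted: %u\n", claim_with_backoff(&free_clusters, 16));
	return 0;
}

The quota loop just below the claim loop has the same shape, but it shrinks the request one block at a time instead of halving it.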
6302 :
6303 : /*
6304 : * We can merge two free data extents only if the physical blocks
6305 : * are contiguous, AND the extents were freed by the same transaction,
6306 : * AND the blocks are associated with the same group.
6307 : */
6308 4210765 : static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
6309 : struct ext4_free_data *entry,
6310 : struct ext4_free_data *new_entry,
6311 : struct rb_root *entry_rb_root)
6312 : {
6313 4210765 : if ((entry->efd_tid != new_entry->efd_tid) ||
6314 4188293 : (entry->efd_group != new_entry->efd_group))
6315 : return;
6316 4188295 : if (entry->efd_start_cluster + entry->efd_count ==
6317 4188295 : new_entry->efd_start_cluster) {
6318 408840 : new_entry->efd_start_cluster = entry->efd_start_cluster;
6319 408840 : new_entry->efd_count += entry->efd_count;
6320 3779455 : } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
6321 : entry->efd_start_cluster) {
6322 551353 : new_entry->efd_count += entry->efd_count;
6323 : } else
6324 : return;
6325 960193 : spin_lock(&sbi->s_md_lock);
6326 960194 : list_del(&entry->efd_list);
6327 960194 : spin_unlock(&sbi->s_md_lock);
6328 960194 : rb_erase(&entry->efd_node, entry_rb_root);
6329 960194 : kmem_cache_free(ext4_free_data_cachep, entry);
6330 : }
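The merge conditions documented above (freed in the same transaction, same block group, physically adjacent) are easy to demonstrate in isolation. A user-space sketch with a simplified stand-in for struct ext4_free_data; the type and field names are illustrative only, and only the "previous extent ends where the new one starts" case is shown:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct ext4_free_data: just the fields the
 * merge test needs. */
struct free_extent {
	unsigned int tid;	/* transaction that freed the extent */
	unsigned int group;	/* block group */
	unsigned int start;	/* first cluster */
	unsigned int count;	/* number of clusters */
};

/* Merge 'prev' into 'next' when both were freed by the same transaction,
 * belong to the same group, and are physically adjacent. */
static bool try_merge(const struct free_extent *prev, struct free_extent *next)
{
	if (prev->tid != next->tid || prev->group != next->group)
		return false;
	if (prev->start + prev->count != next->start)
		return false;
	next->start = prev->start;
	next->count += prev->count;
	return true;
}

int main(void)
{
	struct free_extent a = { .tid = 7, .group = 1, .start = 100, .count = 8 };
	struct free_extent b = { .tid = 7, .group = 1, .start = 108, .count = 4 };

	if (try_merge(&a, &b))
		printf("merged extent: start=%u count=%u\n", b.start, b.count);
	return 0;
}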
6331 :
6332 : static noinline_for_stack void
6333 2624605 : ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
6334 : struct ext4_free_data *new_entry)
6335 : {
6336 2624605 : ext4_group_t group = e4b->bd_group;
6337 2624605 : ext4_grpblk_t cluster;
6338 2624605 : ext4_grpblk_t clusters = new_entry->efd_count;
6339 2624605 : struct ext4_free_data *entry;
6340 2624605 : struct ext4_group_info *db = e4b->bd_info;
6341 2624605 : struct super_block *sb = e4b->bd_sb;
6342 2624605 : struct ext4_sb_info *sbi = EXT4_SB(sb);
6343 2624605 : struct rb_node **n = &db->bb_free_root.rb_node, *node;
6344 2624605 : struct rb_node *parent = NULL, *new_node;
6345 :
6346 2624605 : BUG_ON(!ext4_handle_valid(handle));
6347 2624605 : BUG_ON(e4b->bd_bitmap_page == NULL);
6348 2624605 : BUG_ON(e4b->bd_buddy_page == NULL);
6349 :
6350 2624605 : new_node = &new_entry->efd_node;
6351 2624605 : cluster = new_entry->efd_start_cluster;
6352 :
6353 2624605 : if (!*n) {
6354 : /* This is the first free block extent. We need to
6355 : * protect the buddy cache from being freed,
6356 : * otherwise we'll refresh it from the
6357 : * on-disk bitmap and lose not-yet-available
6358 : * blocks */
6359 212330 : get_page(e4b->bd_buddy_page);
6360 212336 : get_page(e4b->bd_bitmap_page);
6361 : }
6362 14323932 : while (*n) {
6363 11699321 : parent = *n;
6364 11699321 : entry = rb_entry(parent, struct ext4_free_data, efd_node);
6365 11699321 : if (cluster < entry->efd_start_cluster)
6366 5560455 : n = &(*n)->rb_left;
6367 6138866 : else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
6368 6138866 : n = &(*n)->rb_right;
6369 : else {
6370 0 : ext4_grp_locked_error(sb, group, 0,
6371 : ext4_group_first_block_no(sb, group) +
6372 : EXT4_C2B(sbi, cluster),
6373 : "Block already on to-be-freed list");
6374 0 : kmem_cache_free(ext4_free_data_cachep, new_entry);
6375 0 : return;
6376 : }
6377 : }
6378 :
6379 2624611 : rb_link_node(new_node, parent, n);
6380 2624611 : rb_insert_color(new_node, &db->bb_free_root);
6381 :
6382 : /* Now see if the extent can be merged to the left and right */
6383 2624606 : node = rb_prev(new_node);
6384 2624596 : if (node) {
6385 2071302 : entry = rb_entry(node, struct ext4_free_data, efd_node);
6386 2071302 : ext4_try_merge_freed_extent(sbi, entry, new_entry,
6387 : &(db->bb_free_root));
6388 : }
6389 :
6390 2624591 : node = rb_next(new_node);
6391 2624592 : if (node) {
6392 2139484 : entry = rb_entry(node, struct ext4_free_data, efd_node);
6393 2139484 : ext4_try_merge_freed_extent(sbi, entry, new_entry,
6394 : &(db->bb_free_root));
6395 : }
6396 :
6397 2624587 : spin_lock(&sbi->s_md_lock);
6398 2624621 : list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
6399 2624621 : sbi->s_mb_free_pending += clusters;
6400 2624621 : spin_unlock(&sbi->s_md_lock);
6401 : }
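The descent above keys the to-be-freed tree on the starting cluster and reports an error ("Block already on to-be-freed list") when the new start lands inside an existing entry. The same ordering test in a self-contained sketch, using a plain binary search tree instead of the kernel rbtree; the node type and insert() helper are hypothetical:

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int start, count;
	struct node *left, *right;
};

/* Insert [start, start+count) keyed on start. Only the start is tested
 * against existing ranges, mirroring the kernel's descent; hitting an
 * existing range is reported as an error. */
static int insert(struct node **root, unsigned int start, unsigned int count)
{
	while (*root) {
		struct node *n = *root;

		if (start < n->start)
			root = &n->left;
		else if (start >= n->start + n->count)
			root = &n->right;
		else
			return -1;	/* already on the to-be-freed list */
	}
	*root = calloc(1, sizeof(**root));
	(*root)->start = start;
	(*root)->count = count;
	return 0;
}

int main(void)
{
	struct node *root = NULL;

	printf("%d\n", insert(&root, 100, 8));	/* 0 */
	printf("%d\n", insert(&root, 200, 4));	/* 0 */
	printf("%d\n", insert(&root, 104, 2));	/* -1, overlaps the first range */
	return 0;
}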
6402 :
6403 0 : static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
6404 : unsigned long count)
6405 : {
6406 0 : struct buffer_head *bitmap_bh;
6407 0 : struct super_block *sb = inode->i_sb;
6408 0 : struct ext4_group_desc *gdp;
6409 0 : struct buffer_head *gdp_bh;
6410 0 : ext4_group_t group;
6411 0 : ext4_grpblk_t blkoff;
6412 0 : int already_freed = 0, err, i;
6413 :
6414 0 : ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
6415 0 : bitmap_bh = ext4_read_block_bitmap(sb, group);
6416 0 : if (IS_ERR(bitmap_bh)) {
6417 0 : pr_warn("Failed to read block bitmap\n");
6418 0 : return;
6419 : }
6420 0 : gdp = ext4_get_group_desc(sb, group, &gdp_bh);
6421 0 : if (!gdp)
6422 0 : goto err_out;
6423 :
6424 0 : for (i = 0; i < count; i++) {
6425 0 : if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
6426 0 : already_freed++;
6427 : }
6428 0 : mb_clear_bits(bitmap_bh->b_data, blkoff, count);
6429 0 : err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
6430 0 : if (err)
6431 0 : goto err_out;
6432 0 : ext4_free_group_clusters_set(
6433 0 : sb, gdp, ext4_free_group_clusters(sb, gdp) +
6434 : count - already_freed);
6435 0 : ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
6436 0 : ext4_group_desc_csum_set(sb, group, gdp);
6437 0 : ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
6438 0 : sync_dirty_buffer(bitmap_bh);
6439 0 : sync_dirty_buffer(gdp_bh);
6440 :
6441 0 : err_out:
6442 0 : brelse(bitmap_bh);
6443 : }
6444 :
6445 : /**
6446 : * ext4_mb_clear_bb() -- helper function for freeing blocks.
6447 : * Used by ext4_free_blocks()
6448 : * @handle: handle for this transaction
6449 : * @inode: inode
6450 : * @block: starting physical block to be freed
6451 : * @count: number of blocks to be freed
6452 : * @flags: flags used by ext4_free_blocks
6453 : */
6454 2622166 : static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
6455 : ext4_fsblk_t block, unsigned long count,
6456 : int flags)
6457 : {
6458 2622166 : struct buffer_head *bitmap_bh = NULL;
6459 2622166 : struct super_block *sb = inode->i_sb;
6460 2622166 : struct ext4_group_desc *gdp;
6461 2622166 : struct ext4_group_info *grp;
6462 2622166 : unsigned int overflow;
6463 2622166 : ext4_grpblk_t bit;
6464 2622166 : struct buffer_head *gd_bh;
6465 2622166 : ext4_group_t block_group;
6466 2622166 : struct ext4_sb_info *sbi;
6467 2622166 : struct ext4_buddy e4b;
6468 2622166 : unsigned int count_clusters;
6469 2622166 : int err = 0;
6470 2622166 : int ret;
6471 :
6472 2622166 : sbi = EXT4_SB(sb);
6473 :
6474 2622168 : if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6475 2 : !ext4_inode_block_valid(inode, block, count)) {
6476 0 : ext4_error(sb, "Freeing blocks in system zone - "
6477 : "Block = %llu, count = %lu", block, count);
6478 : /* err = 0. ext4_std_error should be a no op */
6479 0 : goto error_return;
6480 : }
6481 2622166 : flags |= EXT4_FREE_BLOCKS_VALIDATED;
6482 :
6483 2624607 : do_more:
6484 2624607 : overflow = 0;
6485 2624607 : ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
6486 :
6487 2624596 : grp = ext4_get_group_info(sb, block_group);
6488 2624592 : if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
6489 : return;
6490 :
6491 : /*
6492 : * Check to see if we are freeing blocks across a group
6493 : * boundary.
6494 : */
6495 2624592 : if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
6496 2441 : overflow = EXT4_C2B(sbi, bit) + count -
6497 2441 : EXT4_BLOCKS_PER_GROUP(sb);
6498 2441 : count -= overflow;
6499 : /* The range changed so it's no longer validated */
6500 2441 : flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6501 : }
6502 2624592 : count_clusters = EXT4_NUM_B2C(sbi, count);
6503 2624592 : bitmap_bh = ext4_read_block_bitmap(sb, block_group);
6504 2624586 : if (IS_ERR(bitmap_bh)) {
6505 0 : err = PTR_ERR(bitmap_bh);
6506 0 : bitmap_bh = NULL;
6507 0 : goto error_return;
6508 : }
6509 2624586 : gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
6510 2624566 : if (!gdp) {
6511 0 : err = -EIO;
6512 0 : goto error_return;
6513 : }
6514 :
6515 2629448 : if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6516 4882 : !ext4_inode_block_valid(inode, block, count)) {
6517 0 : ext4_error(sb, "Freeing blocks in system zone - "
6518 : "Block = %llu, count = %lu", block, count);
6519 : /* err = 0. ext4_std_error should be a no op */
6520 0 : goto error_return;
6521 : }
6522 :
6523 2624566 : BUFFER_TRACE(bitmap_bh, "getting write access");
6524 2624566 : err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6525 : EXT4_JTR_NONE);
6526 2624610 : if (err)
6527 0 : goto error_return;
6528 :
6529 : /*
6530 : * We are about to modify some metadata. Call the journal APIs
6531 : * to unshare ->b_data if a currently-committing transaction is
6532 : * using it
6533 : */
6534 2624610 : BUFFER_TRACE(gd_bh, "get_write_access");
6535 2624610 : err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
6536 2624611 : if (err)
6537 0 : goto error_return;
6538 : #ifdef AGGRESSIVE_CHECK
6539 : {
6540 : int i;
6541 : for (i = 0; i < count_clusters; i++)
6542 : BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
6543 : }
6544 : #endif
6545 2624611 : trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
6546 :
6547 : /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
6548 2624606 : err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
6549 : GFP_NOFS|__GFP_NOFAIL);
6550 2624594 : if (err)
6551 0 : goto error_return;
6552 :
6553 : /*
6554 : * We need to make sure we don't reuse the freed block until after the
6555 : * transaction is committed. We make an exception if the inode is to be
6556 : * written in writeback mode since writeback mode has weak data
6557 : * consistency guarantees.
6558 : */
6559 2624594 : if (ext4_handle_valid(handle) &&
6560 4964928 : ((flags & EXT4_FREE_BLOCKS_METADATA) ||
6561 2624619 : !ext4_should_writeback_data(inode))) {
6562 2624593 : struct ext4_free_data *new_entry;
6563 : /*
6564 : * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
6565 : * to fail.
6566 : */
6567 2624593 : new_entry = kmem_cache_alloc(ext4_free_data_cachep,
6568 : GFP_NOFS|__GFP_NOFAIL);
6569 2624590 : new_entry->efd_start_cluster = bit;
6570 2624590 : new_entry->efd_group = block_group;
6571 2624590 : new_entry->efd_count = count_clusters;
6572 2624590 : new_entry->efd_tid = handle->h_transaction->t_tid;
6573 :
6574 2624590 : ext4_lock_group(sb, block_group);
6575 2624590 : mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6576 2624602 : ext4_mb_free_metadata(handle, &e4b, new_entry);
6577 : } else {
6578 : /* We need to update group_info->bb_free and the bitmap
6579 : * with the group lock held; generate_buddy looks at
6580 : * them with the group lock held.
6581 : */
6582 1 : if (test_opt(sb, DISCARD)) {
6583 0 : err = ext4_issue_discard(sb, block_group, bit,
6584 : count_clusters, NULL);
6585 0 : if (err && err != -EOPNOTSUPP)
6586 0 : ext4_msg(sb, KERN_WARNING, "discard request in"
6587 : " group:%u block:%d count:%lu failed"
6588 : " with %d", block_group, bit, count,
6589 : err);
6590 : } else
6591 1 : EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
6592 :
6593 1 : ext4_lock_group(sb, block_group);
6594 1 : mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6595 1 : mb_free_blocks(inode, &e4b, bit, count_clusters);
6596 : }
6597 :
6598 2624620 : ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
6599 2624619 : ext4_free_group_clusters_set(sb, gdp, ret);
6600 2624616 : ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
6601 2624584 : ext4_group_desc_csum_set(sb, block_group, gdp);
6602 2624594 : ext4_unlock_group(sb, block_group);
6603 :
6604 2624553 : if (sbi->s_log_groups_per_flex) {
6605 2624400 : ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6606 5248834 : atomic64_add(count_clusters,
6607 2624403 : &sbi_array_rcu_deref(sbi, s_flex_groups,
6608 : flex_group)->free_clusters);
6609 : }
6610 :
6611 : /*
6612 : * on a bigalloc file system, defer the s_freeclusters_counter
6613 : * update to the caller (ext4_remove_space and friends) so they
6614 : * can determine if a cluster freed here should be rereserved
6615 : */
6616 2624602 : if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
6617 2624602 : if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
6618 2624600 : dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
6619 2624613 : percpu_counter_add(&sbi->s_freeclusters_counter,
6620 : count_clusters);
6621 : }
6622 :
6623 2624609 : ext4_mb_unload_buddy(&e4b);
6624 :
6625 : /* We dirtied the bitmap block */
6626 2624607 : BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6627 2624607 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6628 :
6629 : /* And the group descriptor block */
6630 2624612 : BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6631 2624612 : ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6632 2624593 : if (!err)
6633 2624607 : err = ret;
6634 :
6635 2624593 : if (overflow && !err) {
6636 2441 : block += count;
6637 2441 : count = overflow;
6638 2441 : put_bh(bitmap_bh);
6639 : /* The range changed so it's no longer validated */
6640 2441 : flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6641 2441 : goto do_more;
6642 : }
6643 2622152 : error_return:
6644 2622152 : brelse(bitmap_bh);
6645 2622172 : ext4_std_error(sb, err);
6646 : return;
6647 : }
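The do_more loop above is driven by simple arithmetic: ext4_get_group_no_and_offset() splits a block number into a (group, offset) pair, and a range that crosses the end of the group is clipped, with the overflow carried into the next group. A sketch with a hypothetical group size, ignoring s_first_data_block and the cluster ratio:

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL	/* hypothetical: 128 MiB groups of 4 KiB blocks */

int main(void)
{
	unsigned long long block = 98000, count = 1000;

	while (count) {
		unsigned long long group = block / BLOCKS_PER_GROUP;
		unsigned long long bit = block % BLOCKS_PER_GROUP;
		unsigned long long chunk = count, overflow = 0;

		/* Clip the chunk at the end of this group, as the do_more loop does. */
		if (bit + count > BLOCKS_PER_GROUP) {
			overflow = bit + count - BLOCKS_PER_GROUP;
			chunk = count - overflow;
		}
		printf("group %llu: free %llu blocks starting at offset %llu\n",
		       group, chunk, bit);

		block += chunk;
		count = overflow;
	}
	return 0;
}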
6648 :
6649 : /**
6650 : * ext4_free_blocks() -- Free given blocks and update quota
6651 : * @handle: handle for this transaction
6652 : * @inode: inode
6653 : * @bh: optional buffer of the block to be freed
6654 : * @block: starting physical block to be freed
6655 : * @count: number of blocks to be freed
6656 : * @flags: flags used by ext4_free_blocks
6657 : */
6658 2622126 : void ext4_free_blocks(handle_t *handle, struct inode *inode,
6659 : struct buffer_head *bh, ext4_fsblk_t block,
6660 : unsigned long count, int flags)
6661 : {
6662 2622126 : struct super_block *sb = inode->i_sb;
6663 2622126 : unsigned int overflow;
6664 2622126 : struct ext4_sb_info *sbi;
6665 :
6666 2622126 : sbi = EXT4_SB(sb);
6667 :
6668 2622126 : if (bh) {
6669 26284 : if (block)
6670 0 : BUG_ON(block != bh->b_blocknr);
6671 : else
6672 26284 : block = bh->b_blocknr;
6673 : }
6674 :
6675 2622126 : if (sbi->s_mount_state & EXT4_FC_REPLAY) {
6676 0 : ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
6677 0 : return;
6678 : }
6679 :
6680 2622126 : might_sleep();
6681 :
6682 5244192 : if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6683 2622045 : !ext4_inode_block_valid(inode, block, count)) {
6684 0 : ext4_error(sb, "Freeing blocks not in datazone - "
6685 : "block = %llu, count = %lu", block, count);
6686 0 : return;
6687 : }
6688 2622147 : flags |= EXT4_FREE_BLOCKS_VALIDATED;
6689 :
6690 2622147 : ext4_debug("freeing block %llu\n", block);
6691 2622147 : trace_ext4_free_blocks(inode, block, count, flags);
6692 :
6693 2622120 : if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6694 26282 : BUG_ON(count > 1);
6695 :
6696 26282 : ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
6697 : inode, bh, block);
6698 : }
6699 :
6700 : /*
6701 : * If the extent to be freed does not begin on a cluster
6702 : * boundary, we need to deal with partial clusters at the
6703 : * beginning and end of the extent. Normally we will free
6704 : * blocks at the beginning or the end unless we are explicitly
6705 : * requested to avoid doing so.
6706 : */
6707 2622122 : overflow = EXT4_PBLK_COFF(sbi, block);
6708 2622122 : if (overflow) {
6709 0 : if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
6710 0 : overflow = sbi->s_cluster_ratio - overflow;
6711 0 : block += overflow;
6712 0 : if (count > overflow)
6713 0 : count -= overflow;
6714 : else
6715 : return;
6716 : } else {
6717 0 : block -= overflow;
6718 0 : count += overflow;
6719 : }
6720 : /* The range changed so it's no longer validated */
6721 0 : flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6722 : }
6723 2622122 : overflow = EXT4_LBLK_COFF(sbi, count);
6724 2622122 : if (overflow) {
6725 2 : if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
6726 0 : if (count > overflow)
6727 0 : count -= overflow;
6728 : else
6729 : return;
6730 : } else
6731 2 : count += sbi->s_cluster_ratio - overflow;
6732 : /* The range changed so it's no longer validated */
6733 2 : flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6734 : }
6735 :
6736 2622122 : if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6737 257955 : int i;
6738 257955 : int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
6739 :
6740 520879 : for (i = 0; i < count; i++) {
6741 262884 : cond_resched();
6742 262865 : if (is_metadata)
6743 262863 : bh = sb_find_get_block(inode->i_sb, block + i);
6744 262912 : ext4_forget(handle, is_metadata, inode, bh, block + i);
6745 : }
6746 : }
6747 :
6748 2622162 : ext4_mb_clear_bb(handle, inode, block, count, flags);
6749 2622162 : return;
6750 : }
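On a bigalloc file system the partial-cluster handling above is modular arithmetic on the cluster ratio: the start is rounded down to a cluster boundary and the length rounded up to whole clusters, unless the EXT4_FREE_BLOCKS_NOFREE_FIRST/LAST_CLUSTER flags forbid freeing the partial ends. A sketch assuming a hypothetical ratio of 16 blocks per cluster and neither flag set:

#include <stdio.h>

#define CLUSTER_RATIO 16ULL	/* hypothetical blocks per cluster */

int main(void)
{
	unsigned long long block = 1030, count = 50;

	/* Head: round the start down to a cluster boundary. */
	unsigned long long head = block % CLUSTER_RATIO;
	block -= head;
	count += head;

	/* Tail: round the length up to a whole number of clusters. */
	unsigned long long tail = count % CLUSTER_RATIO;
	if (tail)
		count += CLUSTER_RATIO - tail;

	/* Prints: freeing blocks 1024..1087 (4 clusters) */
	printf("freeing blocks %llu..%llu (%llu clusters)\n",
	       block, block + count - 1, count / CLUSTER_RATIO);
	return 0;
}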
6751 :
6752 : /**
6753 : * ext4_group_add_blocks() -- Add given blocks to an existing group
6754 : * @handle: handle to this transaction
6755 : * @sb: super block
6756 : * @block: start physical block to add to the block group
6757 : * @count: number of blocks to free
6758 : *
6759 : * This marks the blocks as free in the bitmap and buddy.
6760 : */
6761 29 : int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
6762 : ext4_fsblk_t block, unsigned long count)
6763 : {
6764 29 : struct buffer_head *bitmap_bh = NULL;
6765 29 : struct buffer_head *gd_bh;
6766 29 : ext4_group_t block_group;
6767 29 : ext4_grpblk_t bit;
6768 29 : unsigned int i;
6769 29 : struct ext4_group_desc *desc;
6770 29 : struct ext4_sb_info *sbi = EXT4_SB(sb);
6771 29 : struct ext4_buddy e4b;
6772 29 : int err = 0, ret, free_clusters_count;
6773 29 : ext4_grpblk_t clusters_freed;
6774 29 : ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
6775 29 : ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
6776 29 : unsigned long cluster_count = last_cluster - first_cluster + 1;
6777 :
6778 29 : ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
6779 :
6780 29 : if (count == 0)
6781 : return 0;
6782 :
6783 29 : ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
6784 : /*
6785 : * Check to see if we are freeing blocks across a group
6786 : * boundary.
6787 : */
6788 29 : if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
6789 0 : ext4_warning(sb, "too many blocks added to group %u",
6790 : block_group);
6791 0 : err = -EINVAL;
6792 0 : goto error_return;
6793 : }
6794 :
6795 29 : bitmap_bh = ext4_read_block_bitmap(sb, block_group);
6796 29 : if (IS_ERR(bitmap_bh)) {
6797 0 : err = PTR_ERR(bitmap_bh);
6798 0 : bitmap_bh = NULL;
6799 0 : goto error_return;
6800 : }
6801 :
6802 29 : desc = ext4_get_group_desc(sb, block_group, &gd_bh);
6803 29 : if (!desc) {
6804 0 : err = -EIO;
6805 0 : goto error_return;
6806 : }
6807 :
6808 29 : if (!ext4_sb_block_valid(sb, NULL, block, count)) {
6809 0 : ext4_error(sb, "Adding blocks in system zones - "
6810 : "Block = %llu, count = %lu",
6811 : block, count);
6812 0 : err = -EINVAL;
6813 0 : goto error_return;
6814 : }
6815 :
6816 29 : BUFFER_TRACE(bitmap_bh, "getting write access");
6817 29 : err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6818 : EXT4_JTR_NONE);
6819 29 : if (err)
6820 0 : goto error_return;
6821 :
6822 : /*
6823 : * We are about to modify some metadata. Call the journal APIs
6824 : * to unshare ->b_data if a currently-committing transaction is
6825 : * using it
6826 : */
6827 29 : BUFFER_TRACE(gd_bh, "get_write_access");
6828 29 : err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
6829 29 : if (err)
6830 0 : goto error_return;
6831 :
6832 225307 : for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
6833 225278 : BUFFER_TRACE(bitmap_bh, "clear bit");
6834 225278 : if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
6835 0 : ext4_error(sb, "bit already cleared for block %llu",
6836 : (ext4_fsblk_t)(block + i));
6837 0 : BUFFER_TRACE(bitmap_bh, "bit already cleared");
6838 : } else {
6839 225278 : clusters_freed++;
6840 : }
6841 : }
6842 :
6843 29 : err = ext4_mb_load_buddy(sb, block_group, &e4b);
6844 29 : if (err)
6845 0 : goto error_return;
6846 :
6847 : /*
6848 : * We need to update group_info->bb_free and the bitmap
6849 : * with the group lock held; generate_buddy looks at
6850 : * them with the group lock held.
6851 : */
6852 29 : ext4_lock_group(sb, block_group);
6853 29 : mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
6854 29 : mb_free_blocks(NULL, &e4b, bit, cluster_count);
6855 58 : free_clusters_count = clusters_freed +
6856 29 : ext4_free_group_clusters(sb, desc);
6857 29 : ext4_free_group_clusters_set(sb, desc, free_clusters_count);
6858 29 : ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
6859 29 : ext4_group_desc_csum_set(sb, block_group, desc);
6860 29 : ext4_unlock_group(sb, block_group);
6861 29 : percpu_counter_add(&sbi->s_freeclusters_counter,
6862 : clusters_freed);
6863 :
6864 29 : if (sbi->s_log_groups_per_flex) {
6865 29 : ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6866 58 : atomic64_add(clusters_freed,
6867 29 : &sbi_array_rcu_deref(sbi, s_flex_groups,
6868 : flex_group)->free_clusters);
6869 : }
6870 :
6871 29 : ext4_mb_unload_buddy(&e4b);
6872 :
6873 : /* We dirtied the bitmap block */
6874 29 : BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6875 29 : err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6876 :
6877 : /* And the group descriptor block */
6878 29 : BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6879 29 : ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6880 29 : if (!err)
6881 29 : err = ret;
6882 :
6883 0 : error_return:
6884 29 : brelse(bitmap_bh);
6885 29 : ext4_std_error(sb, err);
6886 : return err;
6887 : }
6888 :
6889 : /**
6890 : * ext4_trim_extent -- function to TRIM one single free extent in the group
6891 : * @sb: super block for the file system
6892 : * @start: starting block of the free extent in the alloc. group
6893 : * @count: number of blocks to TRIM
6894 : * @e4b: ext4 buddy for the group
6895 : *
6896 : * Trim "count" blocks starting at "start" in the "group". To assure that no
6897 : * one will allocate those blocks, mark it as used in buddy bitmap. This must
6898 : * be called with under the group lock.
6899 : */
6900 17545 : static int ext4_trim_extent(struct super_block *sb,
6901 : int start, int count, struct ext4_buddy *e4b)
6902 : __releases(bitlock)
6903 : __acquires(bitlock)
6904 : {
6905 17545 : struct ext4_free_extent ex;
6906 17545 : ext4_group_t group = e4b->bd_group;
6907 17545 : int ret = 0;
6908 :
6909 17545 : trace_ext4_trim_extent(sb, group, start, count);
6910 :
6911 17545 : assert_spin_locked(ext4_group_lock_ptr(sb, group));
6912 :
6913 17545 : ex.fe_start = start;
6914 17545 : ex.fe_group = group;
6915 17545 : ex.fe_len = count;
6916 :
6917 : /*
6918 : * Mark the blocks as used so that no one can reuse them while
6919 : * they are being trimmed.
6920 : */
6921 17545 : mb_mark_used(e4b, &ex);
6922 17545 : ext4_unlock_group(sb, group);
6923 17545 : ret = ext4_issue_discard(sb, group, start, count, NULL);
6924 17545 : ext4_lock_group(sb, group);
6925 17545 : mb_free_blocks(NULL, e4b, start, ex.fe_len);
6926 17545 : return ret;
6927 : }
6928 :
6929 14348 : static int ext4_try_to_trim_range(struct super_block *sb,
6930 : struct ext4_buddy *e4b, ext4_grpblk_t start,
6931 : ext4_grpblk_t max, ext4_grpblk_t minblocks)
6932 : __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
6933 : __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
6934 : {
6935 14348 : ext4_grpblk_t next, count, free_count;
6936 14348 : void *bitmap;
6937 :
6938 14348 : bitmap = e4b->bd_bitmap;
6939 14348 : start = (e4b->bd_info->bb_first_free > start) ?
6940 : e4b->bd_info->bb_first_free : start;
6941 14348 : count = 0;
6942 14348 : free_count = 0;
6943 :
6944 18370 : while (start <= max) {
6945 18367 : start = mb_find_next_zero_bit(bitmap, max + 1, start);
6946 18367 : if (start > max)
6947 : break;
6948 18367 : next = mb_find_next_bit(bitmap, max + 1, start);
6949 :
6950 18367 : if ((next - start) >= minblocks) {
6951 17545 : int ret = ext4_trim_extent(sb, start, next - start, e4b);
6952 :
6953 17545 : if (ret && ret != -EOPNOTSUPP)
6954 : break;
6955 17545 : count += next - start;
6956 : }
6957 18367 : free_count += next - start;
6958 18367 : start = next + 1;
6959 :
6960 18367 : if (fatal_signal_pending(current)) {
6961 : count = -ERESTARTSYS;
6962 : break;
6963 : }
6964 :
6965 18366 : if (need_resched()) {
6966 11 : ext4_unlock_group(sb, e4b->bd_group);
6967 11 : cond_resched();
6968 11 : ext4_lock_group(sb, e4b->bd_group);
6969 : }
6970 :
6971 18366 : if ((e4b->bd_info->bb_free - free_count) < minblocks)
6972 : break;
6973 : }
6974 :
6975 14348 : return count;
6976 : }
6977 :
6978 : /**
6979 : * ext4_trim_all_free -- function to trim all free space in alloc. group
6980 : * @sb: super block for file system
6981 : * @group: group to be trimmed
6982 : * @start: first group block to examine
6983 : * @max: last group block to examine
6984 : * @minblocks: minimum extent block count
6985 : * @set_trimmed: set the trimmed flag if at least one block is trimmed
6986 : *
6987 : * ext4_trim_all_free walks through the group's block bitmap searching for free
6988 : * extents. When a free extent is found, it is marked as used in the group buddy
6989 : * bitmap, a TRIM command is issued on the extent, and the extent is then freed
6990 : * again in the group buddy bitmap.
6991 : */
6992 : static ext4_grpblk_t
6993 4679755 : ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
6994 : ext4_grpblk_t start, ext4_grpblk_t max,
6995 : ext4_grpblk_t minblocks, bool set_trimmed)
6996 : {
6997 4679755 : struct ext4_buddy e4b;
6998 4679755 : int ret;
6999 :
7000 4679755 : trace_ext4_trim_all_free(sb, group, start, max);
7001 :
7002 4678655 : ret = ext4_mb_load_buddy(sb, group, &e4b);
7003 4691937 : if (ret) {
7004 0 : ext4_warning(sb, "Error %d loading buddy information for %u",
7005 : ret, group);
7006 0 : return ret;
7007 : }
7008 :
7009 4691937 : ext4_lock_group(sb, group);
7010 :
7011 4696258 : if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
7012 4693504 : minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
7013 14348 : ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
7014 14348 : if (ret >= 0 && set_trimmed)
7015 14344 : EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
7016 : } else {
7017 : ret = 0;
7018 : }
7019 :
7020 4696258 : ext4_unlock_group(sb, group);
7021 4696957 : ext4_mb_unload_buddy(&e4b);
7022 :
7023 4696957 : ext4_debug("trimmed %d blocks in the group %d\n",
7024 : ret, group);
7025 :
7026 4696957 : return ret;
7027 : }
7028 :
7029 : /**
7030 : * ext4_trim_fs() -- trim ioctl handle function
7031 : * @sb: superblock for filesystem
7032 : * @range: fstrim_range structure
7033 : *
7034 : * start: first byte to trim
7035 : * len: number of bytes to trim from start
7036 : * minlen: minimum extent length in bytes
7037 : * ext4_trim_fs goes through all the allocation groups containing bytes from
7038 : * start to start+len. For each such group, the ext4_trim_all_free function
7039 : * is invoked to trim all free space.
7040 : */
7041 18121 : int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
7042 : {
7043 18121 : unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
7044 18121 : struct ext4_group_info *grp;
7045 18121 : ext4_group_t group, first_group, last_group;
7046 18121 : ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
7047 18121 : uint64_t start, end, minlen, trimmed = 0;
7048 18121 : ext4_fsblk_t first_data_blk =
7049 18121 : le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
7050 18121 : ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
7051 18121 : bool whole_group, eof = false;
7052 18121 : int ret = 0;
7053 :
7054 18121 : start = range->start >> sb->s_blocksize_bits;
7055 18121 : end = start + (range->len >> sb->s_blocksize_bits) - 1;
7056 18121 : minlen = EXT4_NUM_B2C(EXT4_SB(sb),
7057 : range->minlen >> sb->s_blocksize_bits);
7058 :
7059 18121 : if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
7060 18116 : start >= max_blks ||
7061 18116 : range->len < sb->s_blocksize)
7062 : return -EINVAL;
7063 : /* No point in trying to trim less than the discard granularity */
7064 18114 : if (range->minlen < discard_granularity) {
7065 18040 : minlen = EXT4_NUM_B2C(EXT4_SB(sb),
7066 : discard_granularity >> sb->s_blocksize_bits);
7067 18040 : if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
7068 0 : goto out;
7069 : }
7070 18114 : if (end >= max_blks - 1) {
7071 18101 : end = max_blks - 1;
7072 18101 : eof = true;
7073 : }
7074 18114 : if (end <= first_data_blk)
7075 0 : goto out;
7076 18114 : if (start < first_data_blk)
7077 : start = first_data_blk;
7078 :
7079 : /* Determine first and last group to examine based on start and end */
7080 18114 : ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
7081 : &first_group, &first_cluster);
7082 18142 : ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
7083 : &last_group, &last_cluster);
7084 :
7085 : /* end now represents the last cluster to discard in this group */
7086 18149 : end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
7087 18149 : whole_group = true;
7088 :
7089 4857958 : for (group = first_group; group <= last_group; group++) {
7090 4839789 : grp = ext4_get_group_info(sb, group);
7091 4824593 : if (!grp)
7092 0 : continue;
7093 : /* We only do this if the grp has never been initialized */
7094 4824593 : if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
7095 2379 : ret = ext4_mb_init_group(sb, group, GFP_NOFS);
7096 2379 : if (ret)
7097 : break;
7098 : }
7099 :
7100 : /*
7101 : * For all the groups except the last one, the last cluster will
7102 : * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
7103 : * change it for the last group; note that last_cluster was
7104 : * already computed earlier by ext4_get_group_no_and_offset().
7105 : */
7106 4824593 : if (group == last_group) {
7107 18123 : end = last_cluster;
7108 18123 : whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
7109 : }
7110 4824593 : if (grp->bb_free >= minlen) {
7111 4679780 : cnt = ext4_trim_all_free(sb, group, first_cluster,
7112 : end, minlen, whole_group);
7113 4694997 : if (cnt < 0) {
7114 : ret = cnt;
7115 : break;
7116 : }
7117 4694996 : trimmed += cnt;
7118 : }
7119 :
7120 : /*
7121 : * For every group except the first one, we are sure
7122 : * that the first cluster to discard will be cluster #0.
7123 : */
7124 4839809 : first_cluster = 0;
7125 : }
7126 :
7127 18170 : if (!ret)
7128 18169 : EXT4_SB(sb)->s_last_trim_minblks = minlen;
7129 :
7130 1 : out:
7131 18170 : range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
7132 18170 : return ret;
7133 : }
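ext4_trim_fs() is reached through the FITRIM ioctl (this is what fstrim(8) issues). A minimal user-space caller might look like the sketch below; the mount point path is hypothetical and error handling is kept to a minimum. As the code above shows, the kernel raises minlen to the discard granularity and returns the number of bytes trimmed back in range.len.

#include <fcntl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/ext4", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;	/* whole file system */
	range.minlen = 0;	/* raised to the discard granularity by the kernel */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);

	close(fd);
	return 0;
}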
7134 :
7135 : /* Iterate all the free extents in the group. */
7136 : int
7137 2475 : ext4_mballoc_query_range(
7138 : struct super_block *sb,
7139 : ext4_group_t group,
7140 : ext4_grpblk_t start,
7141 : ext4_grpblk_t end,
7142 : ext4_mballoc_query_range_fn formatter,
7143 : void *priv)
7144 : {
7145 2475 : void *bitmap;
7146 2475 : ext4_grpblk_t next;
7147 2475 : struct ext4_buddy e4b;
7148 2475 : int error;
7149 :
7150 2475 : error = ext4_mb_load_buddy(sb, group, &e4b);
7151 2475 : if (error)
7152 : return error;
7153 2475 : bitmap = e4b.bd_bitmap;
7154 :
7155 2475 : ext4_lock_group(sb, group);
7156 :
7157 2475 : start = (e4b.bd_info->bb_first_free > start) ?
7158 : e4b.bd_info->bb_first_free : start;
7159 2475 : if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
7160 2469 : end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
7161 :
7162 4897 : while (start <= end) {
7163 3231 : start = mb_find_next_zero_bit(bitmap, end + 1, start);
7164 3231 : if (start > end)
7165 : break;
7166 3229 : next = mb_find_next_bit(bitmap, end + 1, start);
7167 :
7168 3229 : ext4_unlock_group(sb, group);
7169 3229 : error = formatter(sb, group, start, next - start, priv);
7170 3229 : if (error)
7171 807 : goto out_unload;
7172 2422 : ext4_lock_group(sb, group);
7173 :
7174 2422 : start = next + 1;
7175 : }
7176 :
7177 1668 : ext4_unlock_group(sb, group);
7178 2475 : out_unload:
7179 2475 : ext4_mb_unload_buddy(&e4b);
7180 :
7181 2475 : return error;
7182 : }
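The formatter argument makes this a generic visitor over free extents: the walk stops as soon as the callback returns non-zero, as the early goto out_unload above shows. The same callback pattern in a self-contained form, with hypothetical names and a toy extent table standing in for the bitmap scan:

#include <stdio.h>

/* Callback invoked once per free extent; returning non-zero stops the walk. */
typedef int (*extent_fn)(unsigned int group, unsigned int start,
			 unsigned int len, void *priv);

/* Walk a toy free-extent list and hand each entry to the callback. */
static int query_free_extents(extent_fn cb, void *priv)
{
	static const unsigned int extents[][3] = {
		{ 0, 10, 4 }, { 0, 32, 8 }, { 1, 0, 16 },
	};

	for (unsigned int i = 0; i < 3; i++) {
		int err = cb(extents[i][0], extents[i][1], extents[i][2], priv);

		if (err)
			return err;
	}
	return 0;
}

static int print_extent(unsigned int group, unsigned int start,
			unsigned int len, void *priv)
{
	unsigned int *total = priv;

	*total += len;
	printf("group %u: %u free clusters at %u\n", group, start, len);
	return 0;
}

int main(void)
{
	unsigned int total = 0;

	query_free_extents(print_extent, &total);
	printf("total free: %u clusters\n", total);
	return 0;
}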