Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "scrub/xfile.h"
11 : #include "scrub/xfarray.h"
12 : #include "scrub/scrub.h"
13 : #include "scrub/trace.h"
14 :
15 : /*
16 : * Large Arrays of Fixed-Size Records
17 : * ==================================
18 : *
19 : * This memory array uses an xfile (which itself is a memfd "file") to store
20 : * large numbers of fixed-size records in memory that can be paged out. This
21 : * puts less stress on the memory reclaim algorithms during an online repair
22 : * because we don't have to pin so much memory. However, array access is less
23 : * direct than would be in a regular memory array. Access to the array is
24 : * performed via indexed load and store methods, and an append method is
25 : * provided for convenience. Array elements can be unset, which sets them to
26 : * all zeroes. Unset entries are skipped during iteration, though direct loads
27 : * will return a zeroed buffer. Callers are responsible for concurrency
28 : * control.
29 : */
30 :
31 : /*
32 : * Pointer to scratch space. Because we can't access the xfile data directly,
33 : * we allocate a small amount of memory on the end of the xfarray structure to
34 : * buffer array items when we need space to store values temporarily.
35 : */
36 : static inline void *xfarray_scratch(struct xfarray *array)
37 : {
38 461279 : return (array + 1);
39 : }
40 :
41 : /* Compute array index given an xfile offset. */
42 : static xfarray_idx_t
43 : xfarray_idx(
44 : struct xfarray *array,
45 : loff_t pos)
46 : {
47 108487295 : if (array->obj_size_log >= 0)
48 7894804 : return (xfarray_idx_t)pos >> array->obj_size_log;
49 :
50 100592491 : return div_u64((xfarray_idx_t)pos, array->obj_size);
51 : }
52 :
53 : /* Compute xfile offset of array element. */
54 : static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
55 : {
56 24711878675 : if (array->obj_size_log >= 0)
57 23631762456 : return idx << array->obj_size_log;
58 :
59 1080116219 : return idx * array->obj_size;
60 : }
61 :
62 : /*
63 : * Initialize a big memory array. Array records cannot be larger than a
64 : * page, and the array cannot span more bytes than the page cache supports.
65 : * If @required_capacity is nonzero, the maximum array size will be set to this
66 : * quantity and the array creation will fail if the underlying storage cannot
67 : * support that many records.
68 : */
69 : int
70 101475962 : xfarray_create(
71 : const char *description,
72 : unsigned long long required_capacity,
73 : size_t obj_size,
74 : struct xfarray **arrayp)
75 : {
76 101475962 : struct xfarray *array;
77 101475962 : struct xfile *xfile;
78 101475962 : int error;
79 :
80 101475962 : ASSERT(obj_size < PAGE_SIZE);
81 :
82 101475962 : error = xfile_create(description, 0, &xfile);
83 101548130 : if (error)
84 : return error;
85 :
86 101550960 : error = -ENOMEM;
87 101550960 : array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
88 101543149 : if (!array)
89 0 : goto out_xfile;
90 :
91 101543149 : array->xfile = xfile;
92 101543149 : array->obj_size = obj_size;
93 :
94 203086298 : if (is_power_of_2(obj_size))
95 1901316 : array->obj_size_log = ilog2(obj_size);
96 : else
97 100592491 : array->obj_size_log = -1;
98 :
99 101543149 : array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
100 101543149 : trace_xfarray_create(array, required_capacity);
101 :
102 101557547 : if (required_capacity > 0) {
103 354447 : if (array->max_nr < required_capacity) {
104 0 : error = -ENOMEM;
105 0 : goto out_xfarray;
106 : }
107 354447 : array->max_nr = required_capacity;
108 : }
109 :
110 101557547 : *arrayp = array;
111 101557547 : return 0;
112 :
113 : out_xfarray:
114 0 : kfree(array);
115 0 : out_xfile:
116 0 : xfile_destroy(xfile);
117 0 : return error;
118 : }
119 :
120 : /* Destroy the array. */
121 : void
122 101578950 : xfarray_destroy(
123 : struct xfarray *array)
124 : {
125 101578950 : xfile_destroy(array->xfile);
126 101579786 : kfree(array);
127 101579592 : }
128 :
129 : /* Load an element from the array. */
130 : int
131 21139257969 : xfarray_load(
132 : struct xfarray *array,
133 : xfarray_idx_t idx,
134 : void *ptr)
135 : {
136 21139257969 : if (idx >= array->nr)
137 : return -ENODATA;
138 :
139 42276446496 : return xfile_obj_load(array->xfile, ptr, array->obj_size,
140 : xfarray_pos(array, idx));
141 : }
142 :
143 : /* Is this array element potentially unset? */
144 : static inline bool
145 0 : xfarray_is_unset(
146 : struct xfarray *array,
147 : loff_t pos)
148 : {
149 0 : void *temp = xfarray_scratch(array);
150 0 : int error;
151 :
152 0 : if (array->unset_slots == 0)
153 : return false;
154 :
155 0 : error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
156 0 : if (!error && xfarray_element_is_null(array, temp))
157 0 : return true;
158 :
159 : return false;
160 : }
161 :
162 : /*
163 : * Unset an array element. If @idx is the last element in the array, the
164 : * array will be truncated. Otherwise, the entry will be zeroed.
165 : */
166 : int
167 0 : xfarray_unset(
168 : struct xfarray *array,
169 : xfarray_idx_t idx)
170 : {
171 0 : void *temp = xfarray_scratch(array);
172 0 : loff_t pos = xfarray_pos(array, idx);
173 0 : int error;
174 :
175 0 : if (idx >= array->nr)
176 : return -ENODATA;
177 :
178 0 : if (idx == array->nr - 1) {
179 0 : array->nr--;
180 0 : return 0;
181 : }
182 :
183 0 : if (xfarray_is_unset(array, pos))
184 : return 0;
185 :
186 0 : memset(temp, 0, array->obj_size);
187 0 : error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
188 0 : if (error)
189 : return error;
190 :
191 0 : array->unset_slots++;
192 0 : return 0;
193 : }
194 :
195 : /*
196 : * Store an element in the array. The element must not be completely zeroed,
197 : * because those are considered unset sparse elements.
198 : */
199 : int
200 2813575772 : xfarray_store(
201 : struct xfarray *array,
202 : xfarray_idx_t idx,
203 : const void *ptr)
204 : {
205 2813575772 : int ret;
206 :
207 2813575772 : if (idx >= array->max_nr)
208 : return -EFBIG;
209 :
210 2813575772 : ASSERT(!xfarray_element_is_null(array, ptr));
211 :
212 5628491108 : ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
213 : xfarray_pos(array, idx));
214 2815967098 : if (ret)
215 : return ret;
216 :
217 2815967098 : array->nr = max(array->nr, idx + 1);
218 2815967098 : return 0;
219 : }
220 :
221 : /* Is this array element NULL? */
222 : bool
223 22064597604 : xfarray_element_is_null(
224 : struct xfarray *array,
225 : const void *ptr)
226 : {
227 22064597604 : return !memchr_inv(ptr, 0, array->obj_size);
228 : }
229 :
230 : /*
231 : * Store an element anywhere in the array that is unset. If there are no
232 : * unset slots, append the element to the array.
233 : */
234 : int
235 0 : xfarray_store_anywhere(
236 : struct xfarray *array,
237 : const void *ptr)
238 : {
239 0 : void *temp = xfarray_scratch(array);
240 0 : loff_t endpos = xfarray_pos(array, array->nr);
241 0 : loff_t pos;
242 0 : int error;
243 :
244 : /* Find an unset slot to put it in. */
245 0 : for (pos = 0;
246 0 : pos < endpos && array->unset_slots > 0;
247 0 : pos += array->obj_size) {
248 0 : error = xfile_obj_load(array->xfile, temp, array->obj_size,
249 : pos);
250 0 : if (error || !xfarray_element_is_null(array, temp))
251 0 : continue;
252 :
253 0 : error = xfile_obj_store(array->xfile, ptr, array->obj_size,
254 : pos);
255 0 : if (error)
256 : return error;
257 :
258 0 : array->unset_slots--;
259 0 : return 0;
260 : }
261 :
262 : /* No unset slots found; attach it on the end. */
263 0 : array->unset_slots = 0;
264 0 : return xfarray_append(array, ptr);
265 : }
266 :
267 : /* Return length of array. */
268 : uint64_t
269 255835412 : xfarray_length(
270 : struct xfarray *array)
271 : {
272 255835412 : return array->nr;
273 : }
274 :
275 : /*
276 : * Decide which array item we're going to read as part of an _iter_get.
277 : * @cur is the array index, and @pos is the file offset of that array index in
278 : * the backing xfile. Returns ENODATA if we reach the end of the records.
279 : *
280 : * Reading from a hole in a sparse xfile causes page instantiation, so for
281 : * iterating a (possibly sparse) array we need to figure out if the cursor is
282 : * pointing at a totally uninitialized hole and move the cursor up if
283 : * necessary.
284 : */
285 : static inline int
286 19250933645 : xfarray_find_data(
287 : struct xfarray *array,
288 : xfarray_idx_t *cur,
289 : loff_t *pos)
290 : {
291 19250933645 : unsigned int pgoff = offset_in_page(*pos);
292 19250933645 : loff_t end_pos = *pos + array->obj_size - 1;
293 19250933645 : loff_t new_pos;
294 :
295 : /*
296 : * If the current array record is not adjacent to a page boundary, we
297 : * are in the middle of the page. We do not need to move the cursor.
298 : */
299 19250933645 : if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
300 : return 0;
301 :
302 : /*
303 : * Call SEEK_DATA on the last byte in the record we're about to read.
304 : * If the record ends at (or crosses) the end of a page then we know
305 : * that the first byte of the record is backed by pages and don't need
306 : * to query it. If instead the record begins at the start of the page
307 : * then we know that querying the last byte is just as good as querying
308 : * the first byte, since records cannot be larger than a page.
309 : *
310 : * If the call returns the same file offset, we know this record is
311 : * backed by real pages. We do not need to move the cursor.
312 : */
313 9278176 : new_pos = xfile_seek_data(array->xfile, end_pos);
314 9278260 : if (new_pos == -ENXIO)
315 : return -ENODATA;
316 9278260 : if (new_pos < 0)
317 0 : return new_pos;
318 9278260 : if (new_pos == end_pos)
319 : return 0;
320 :
321 : /*
322 : * Otherwise, SEEK_DATA told us how far up to move the file pointer to
323 : * find more data. Move the array index to the first record past the
324 : * byte offset we were given.
325 : */
326 6944146 : new_pos = roundup_64(new_pos, array->obj_size);
327 6944146 : *cur = xfarray_idx(array, new_pos);
328 6944146 : *pos = xfarray_pos(array, *cur);
329 6944146 : return 0;
330 : }
331 :
332 : /*
333 : * Starting at *idx, fetch the next non-null array entry and advance the index
334 : * to set up the next _load_next call. Returns ENODATA if we reach the end of
335 : * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
336 : * call to this function.
337 : */
338 : int
339 603857321 : xfarray_load_next(
340 : struct xfarray *array,
341 : xfarray_idx_t *idx,
342 : void *rec)
343 : {
344 603857321 : xfarray_idx_t cur = *idx;
345 1207714642 : loff_t pos = xfarray_pos(array, cur);
346 19251350731 : int error;
347 :
348 19251350731 : do {
349 19251350731 : if (cur >= array->nr)
350 : return -ENODATA;
351 :
352 : /*
353 : * Ask the backing store for the location of next possible
354 : * written record, then retrieve that record.
355 : */
356 19250967986 : error = xfarray_find_data(array, &cur, &pos);
357 19250919154 : if (error)
358 0 : return error;
359 19250919154 : error = xfarray_load(array, cur, rec);
360 19250939323 : if (error)
361 0 : return error;
362 :
363 19250939323 : cur++;
364 19250939323 : pos += array->obj_size;
365 19250939323 : } while (xfarray_element_is_null(array, rec));
366 :
367 603475232 : *idx = cur;
368 603475232 : return 0;
369 : }
370 :
371 : /* Sorting functions */
372 :
373 : #ifdef DEBUG
374 : # define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0)
375 : # define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0)
376 : # define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0)
377 : # define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
378 : #else
379 : # define xfarray_sort_bump_loads(si)
380 : # define xfarray_sort_bump_stores(si)
381 : # define xfarray_sort_bump_compares(si)
382 : # define xfarray_sort_bump_heapsorts(si)
383 : #endif /* DEBUG */
384 :
385 : /* Load an array element for sorting. */
386 : static inline int
387 : xfarray_sort_load(
388 : struct xfarray_sortinfo *si,
389 : xfarray_idx_t idx,
390 : void *ptr)
391 : {
392 816444 : xfarray_sort_bump_loads(si);
393 816444 : return xfarray_load(si->array, idx, ptr);
394 : }
395 :
396 : /* Store an array element for sorting. */
397 : static inline int
398 : xfarray_sort_store(
399 : struct xfarray_sortinfo *si,
400 : xfarray_idx_t idx,
401 : void *ptr)
402 : {
403 56007652 : xfarray_sort_bump_stores(si);
404 56007652 : return xfarray_store(si->array, idx, ptr);
405 : }
406 :
407 : /* Compare an array element for sorting. */
408 : static inline int
409 : xfarray_sort_cmp(
410 : struct xfarray_sortinfo *si,
411 : const void *a,
412 : const void *b)
413 : {
414 146699282 : xfarray_sort_bump_compares(si);
415 146699282 : return si->cmp_fn(a, b);
416 : }
417 :
418 : /* Return a pointer to the low index stack for quicksort partitioning. */
419 : static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
420 : {
421 341207 : return (xfarray_idx_t *)(si + 1);
422 : }
423 :
424 : /* Return a pointer to the high index stack for quicksort partitioning. */
425 : static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
426 : {
427 780993 : return xfarray_sortinfo_lo(si) + si->max_stack_depth;
428 : }
429 :
430 : /* Size of each element in the quicksort pivot array. */
431 : static inline size_t
432 : xfarray_pivot_rec_sz(
433 : struct xfarray *array)
434 : {
435 431923 : return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
436 : }
437 :
438 : /* Allocate memory to handle the sort. */
439 : static inline int
440 341207 : xfarray_sortinfo_alloc(
441 : struct xfarray *array,
442 : xfarray_cmp_fn cmp_fn,
443 : unsigned int flags,
444 : struct xfarray_sortinfo **infop)
445 : {
446 341207 : struct xfarray_sortinfo *si;
447 341207 : size_t nr_bytes = sizeof(struct xfarray_sortinfo);
448 341207 : size_t pivot_rec_sz = xfarray_pivot_rec_sz(array);
449 341207 : int max_stack_depth;
450 :
451 : /*
452 : * The median-of-nine pivot algorithm doesn't work if a subset has
453 : * fewer than 9 items. Make sure the in-memory sort will always take
454 : * over for subsets where this wouldn't be the case.
455 : */
456 341207 : BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
457 :
458 : /*
459 : * Tail-call recursion during the partitioning phase means that
460 : * quicksort will never recurse more than log2(nr) times. We need one
461 : * extra level of stack to hold the initial parameters. In-memory
462 : * sort will always take care of the last few levels of recursion for
463 : * us, so we can reduce the stack depth by that much.
464 : */
465 682414 : max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
466 341207 : if (max_stack_depth < 1)
467 194706 : max_stack_depth = 1;
468 :
469 : /* Each level of quicksort uses a lo and a hi index */
470 341207 : nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
471 :
472 : /* Scratchpad for in-memory sort, or finding the pivot */
473 341207 : nr_bytes += max_t(size_t,
474 : (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
475 : XFARRAY_ISORT_NR * array->obj_size);
476 :
477 341207 : si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
478 341207 : if (!si)
479 : return -ENOMEM;
480 :
481 341207 : si->array = array;
482 341207 : si->cmp_fn = cmp_fn;
483 341207 : si->flags = flags;
484 341207 : si->max_stack_depth = max_stack_depth;
485 341207 : si->max_stack_used = 1;
486 :
487 341207 : xfarray_sortinfo_lo(si)[0] = 0;
488 341207 : xfarray_sortinfo_hi(si)[0] = array->nr - 1;
489 :
490 341207 : trace_xfarray_sort(si, nr_bytes);
491 341207 : *infop = si;
492 341207 : return 0;
493 : }
494 :
495 : /* Should this sort be terminated by a fatal signal? */
496 : static inline bool
497 139780287 : xfarray_sort_terminated(
498 : struct xfarray_sortinfo *si,
499 : int *error)
500 : {
501 : /*
502 : * If preemption is disabled, we need to yield to the scheduler every
503 : * few seconds so that we don't run afoul of the soft lockup watchdog
504 : * or RCU stall detector.
505 : */
506 139780287 : cond_resched();
507 :
508 279560574 : if ((si->flags & XFARRAY_SORT_KILLABLE) &&
509 139780287 : fatal_signal_pending(current)) {
510 0 : if (*error == 0)
511 0 : *error = -EINTR;
512 0 : return true;
513 : }
514 : return false;
515 : }
516 :
517 : /* Do we want an in-memory sort? */
518 : static inline bool
519 : xfarray_want_isort(
520 : struct xfarray_sortinfo *si,
521 : xfarray_idx_t start,
522 : xfarray_idx_t end)
523 : {
524 : /*
525 : * For array subsets that fit in the scratchpad, it's much faster to
526 : * use the kernel's heapsort than quicksort's stack machine.
527 : */
528 98579 : return (end - start) < XFARRAY_ISORT_NR;
529 : }
530 :
531 : /* Return the scratch space within the sortinfo structure. */
532 : static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
533 : {
534 7863 : return xfarray_sortinfo_hi(si) + si->max_stack_depth;
535 : }
536 :
537 : /*
538 : * Sort a small number of array records using scratchpad memory. The records
539 : * need not be contiguous in the xfile's memory pages.
540 : */
541 : STATIC int
542 7863 : xfarray_isort(
543 : struct xfarray_sortinfo *si,
544 : xfarray_idx_t lo,
545 : xfarray_idx_t hi)
546 : {
547 7863 : void *scratch = xfarray_sortinfo_isort_scratch(si);
548 7863 : loff_t lo_pos = xfarray_pos(si->array, lo);
549 7863 : loff_t len = xfarray_pos(si->array, hi - lo + 1);
550 7863 : int error;
551 :
552 7863 : trace_xfarray_isort(si, lo, hi);
553 :
554 7863 : xfarray_sort_bump_loads(si);
555 7863 : error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
556 7863 : if (error)
557 : return error;
558 :
559 7863 : xfarray_sort_bump_heapsorts(si);
560 7863 : sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
561 :
562 7863 : xfarray_sort_bump_stores(si);
563 7863 : return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
564 : }
565 :
566 : /* Grab a page for sorting records. */
567 : static inline int
568 56331793 : xfarray_sort_get_page(
569 : struct xfarray_sortinfo *si,
570 : loff_t pos,
571 : uint64_t len)
572 : {
573 56331793 : int error;
574 :
575 56331793 : error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
576 56331794 : if (error)
577 : return error;
578 :
579 : /*
580 : * xfile pages must never be mapped into userspace, so we skip the
581 : * dcache flush when mapping the page.
582 : */
583 56331794 : si->page_kaddr = kmap_local_page(si->xfpage.page);
584 56331794 : return 0;
585 : }
586 :
587 : /* Release a page we grabbed for sorting records. */
588 : static inline int
589 56331791 : xfarray_sort_put_page(
590 : struct xfarray_sortinfo *si)
591 : {
592 56331791 : if (!si->page_kaddr)
593 : return 0;
594 :
595 56331791 : kunmap_local(si->page_kaddr);
596 56331791 : si->page_kaddr = NULL;
597 :
598 56331791 : return xfile_put_page(si->array->xfile, &si->xfpage);
599 : }
600 :
601 : /* Decide if these records are eligible for in-page sorting. */
602 : static inline bool
603 522639 : xfarray_want_pagesort(
604 : struct xfarray_sortinfo *si,
605 : xfarray_idx_t lo,
606 : xfarray_idx_t hi)
607 : {
608 522639 : pgoff_t lo_page;
609 522639 : pgoff_t hi_page;
610 522639 : loff_t end_pos;
611 :
612 : /* We can only map one page at a time. */
613 522639 : lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
614 522639 : end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
615 522639 : hi_page = end_pos >> PAGE_SHIFT;
616 :
617 522639 : return lo_page == hi_page;
618 : }
619 :
620 : /* Sort a bunch of records that all live in the same memory page. */
621 : STATIC int
622 424060 : xfarray_pagesort(
623 : struct xfarray_sortinfo *si,
624 : xfarray_idx_t lo,
625 : xfarray_idx_t hi)
626 : {
627 424060 : void *startp;
628 424060 : loff_t lo_pos = xfarray_pos(si->array, lo);
629 424060 : uint64_t len = xfarray_pos(si->array, hi - lo);
630 424060 : int error = 0;
631 :
632 424060 : trace_xfarray_pagesort(si, lo, hi);
633 :
634 424060 : xfarray_sort_bump_loads(si);
635 424060 : error = xfarray_sort_get_page(si, lo_pos, len);
636 424060 : if (error)
637 : return error;
638 :
639 424060 : xfarray_sort_bump_heapsorts(si);
640 424060 : startp = si->page_kaddr + offset_in_page(lo_pos);
641 424060 : sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
642 :
643 424058 : xfarray_sort_bump_stores(si);
644 424058 : return xfarray_sort_put_page(si);
645 : }
646 :
647 : /* Return a pointer to the xfarray pivot record within the sortinfo struct. */
648 : static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
649 : {
650 431923 : return xfarray_sortinfo_hi(si) + si->max_stack_depth;
651 : }
652 :
653 : /* Return a pointer to the start of the pivot array. */
654 : static inline void *
655 : xfarray_sortinfo_pivot_array(
656 : struct xfarray_sortinfo *si)
657 : {
658 90716 : return xfarray_sortinfo_pivot(si) + si->array->obj_size;
659 : }
660 :
661 : /* The xfarray record is stored at the start of each pivot array element. */
662 : static inline void *
663 : xfarray_pivot_array_rec(
664 : void *pa,
665 : size_t pa_recsz,
666 : unsigned int pa_idx)
667 : {
668 2534504 : return pa + (pa_recsz * pa_idx);
669 : }
670 :
671 : /* The xfarray index is stored at the end of each pivot array element. */
672 : static inline xfarray_idx_t *
673 : xfarray_pivot_array_idx(
674 : void *pa,
675 : size_t pa_recsz,
676 : unsigned int pa_idx)
677 : {
678 2453414 : return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
679 : sizeof(xfarray_idx_t);
680 : }
681 :
682 : /*
683 : * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
684 : * the cached pivot record for the next step.
685 : *
686 : * Load evenly-spaced records within the given range into memory, sort them,
687 : * and choose the pivot from the median record. Using multiple points will
688 : * improve the quality of the pivot selection, and hopefully avoid the worst
689 : * quicksort behavior, since our array values are nearly always evenly sorted.
690 : */
691 : STATIC int
692 90716 : xfarray_qsort_pivot(
693 : struct xfarray_sortinfo *si,
694 : xfarray_idx_t lo,
695 : xfarray_idx_t hi)
696 : {
697 90716 : void *pivot = xfarray_sortinfo_pivot(si);
698 90716 : void *parray = xfarray_sortinfo_pivot_array(si);
699 90716 : void *recp;
700 90716 : xfarray_idx_t *idxp;
701 90716 : xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
702 90716 : size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
703 90716 : int i, j;
704 90716 : int error;
705 :
706 90716 : ASSERT(step > 0);
707 :
708 : /*
709 : * Load the xfarray indexes of the records we intend to sample into the
710 : * pivot array.
711 : */
712 90716 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
713 90716 : *idxp = lo;
714 725728 : for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
715 635012 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
716 635012 : *idxp = lo + (i * step);
717 : }
718 90716 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
719 : XFARRAY_QSORT_PIVOT_NR - 1);
720 90716 : *idxp = hi;
721 :
722 : /* Load the selected xfarray records into the pivot array. */
723 907160 : for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
724 816444 : xfarray_idx_t idx;
725 :
726 816444 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
727 816444 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
728 :
729 : /* No unset records; load directly into the array. */
730 816444 : if (likely(si->array->unset_slots == 0)) {
731 816444 : error = xfarray_sort_load(si, *idxp, recp);
732 816444 : if (error)
733 0 : return error;
734 816444 : continue;
735 : }
736 :
737 : /*
738 : * Load non-null records into the scratchpad without changing
739 : * the xfarray_idx_t in the pivot array.
740 : */
741 0 : idx = *idxp;
742 0 : xfarray_sort_bump_loads(si);
743 0 : error = xfarray_load_next(si->array, &idx, recp);
744 0 : if (error)
745 0 : return error;
746 : }
747 :
748 90716 : xfarray_sort_bump_heapsorts(si);
749 90716 : sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
750 :
751 : /*
752 : * We sorted the pivot array records (which includes the xfarray
753 : * indices) in xfarray record order. The median element of the pivot
754 : * array contains the xfarray record that we will use as the pivot.
755 : * Copy that xfarray record to the designated space.
756 : */
757 90716 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
758 : XFARRAY_QSORT_PIVOT_NR / 2);
759 181432 : memcpy(pivot, recp, si->array->obj_size);
760 :
761 : /* If the pivot record we chose was already in a[lo] then we're done. */
762 90716 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
763 : XFARRAY_QSORT_PIVOT_NR / 2);
764 90716 : if (*idxp == lo)
765 : return 0;
766 :
767 : /*
768 : * Find the cached copy of a[lo] in the pivot array so that we can swap
769 : * a[lo] and a[pivot].
770 : */
771 810900 : for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
772 729810 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
773 729810 : if (*idxp == lo)
774 81090 : j = i;
775 : }
776 81090 : if (j < 0) {
777 0 : ASSERT(j >= 0);
778 0 : return -EFSCORRUPTED;
779 : }
780 :
781 : /* Swap a[lo] and a[pivot]. */
782 81090 : error = xfarray_sort_store(si, lo, pivot);
783 81090 : if (error)
784 : return error;
785 :
786 81090 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
787 81090 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
788 : XFARRAY_QSORT_PIVOT_NR / 2);
789 81090 : return xfarray_sort_store(si, *idxp, recp);
790 : }
791 :
792 : /*
793 : * Set up the pointers for the next iteration. We push onto the stack all of
794 : * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
795 : * current stack frame to point to the unsorted values between a[beg[i]] and
796 : * a[lo] so that those values will be sorted when we pop the stack.
797 : */
798 : static inline int
799 90716 : xfarray_qsort_push(
800 : struct xfarray_sortinfo *si,
801 : xfarray_idx_t *si_lo,
802 : xfarray_idx_t *si_hi,
803 : xfarray_idx_t lo,
804 : xfarray_idx_t hi)
805 : {
806 : /* Check for stack overflows */
807 90716 : if (si->stack_depth >= si->max_stack_depth - 1) {
808 0 : ASSERT(si->stack_depth < si->max_stack_depth - 1);
809 0 : return -EFSCORRUPTED;
810 : }
811 :
812 90716 : si->max_stack_used = max_t(uint8_t, si->max_stack_used,
813 : si->stack_depth + 2);
814 :
815 90716 : si_lo[si->stack_depth + 1] = lo + 1;
816 90716 : si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
817 90716 : si_hi[si->stack_depth++] = lo - 1;
818 :
819 : /*
820 : * Always start with the smaller of the two partitions to keep the
821 : * amount of recursion in check.
822 : */
823 90716 : if (si_hi[si->stack_depth] - si_lo[si->stack_depth] >
824 90716 : si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
825 46284 : swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
826 46284 : swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
827 : }
828 :
829 : return 0;
830 : }
831 :
832 : /*
833 : * Load an element from the array into the first scratchpad and cache the page,
834 : * if possible.
835 : */
836 : static inline int
837 146699282 : xfarray_sort_load_cached(
838 : struct xfarray_sortinfo *si,
839 : xfarray_idx_t idx,
840 : void *ptr)
841 : {
842 146699282 : loff_t idx_pos = xfarray_pos(si->array, idx);
843 146699282 : pgoff_t startpage;
844 146699282 : pgoff_t endpage;
845 146699282 : int error = 0;
846 :
847 : /*
848 : * If this load would split a page, release the cached page, if any,
849 : * and perform a traditional read.
850 : */
851 146699282 : startpage = idx_pos >> PAGE_SHIFT;
852 146699282 : endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
853 146699282 : if (startpage != endpage) {
854 0 : error = xfarray_sort_put_page(si);
855 0 : if (error)
856 : return error;
857 :
858 0 : if (xfarray_sort_terminated(si, &error))
859 0 : return error;
860 :
861 0 : return xfile_obj_load(si->array->xfile, ptr,
862 : si->array->obj_size, idx_pos);
863 : }
864 :
865 : /* If the cached page is not the one we want, release it. */
866 146699282 : if (xfile_page_cached(&si->xfpage) &&
867 : xfile_page_index(&si->xfpage) != startpage) {
868 53176 : error = xfarray_sort_put_page(si);
869 53176 : if (error)
870 : return error;
871 : }
872 :
873 : /*
874 : * If we don't have a cached page (and we know the load is contained
875 : * in a single page) then grab it.
876 : */
877 146699282 : if (!xfile_page_cached(&si->xfpage)) {
878 55907734 : if (xfarray_sort_terminated(si, &error))
879 0 : return error;
880 :
881 55907734 : error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
882 : PAGE_SIZE);
883 55907734 : if (error)
884 : return error;
885 : }
886 :
887 293398564 : memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
888 : si->array->obj_size);
889 146699282 : return 0;
890 : }
891 :
892 : /*
893 : * Sort the array elements via quicksort. This implementation incorporates
894 : * four optimizations discussed in Sedgewick:
895 : *
896 : * 1. Use an explicit stack of array indices to store the next array partition
897 : * to sort. This helps us to avoid recursion in the call stack, which is
898 : * particularly expensive in the kernel.
899 : *
900 : * 2. For arrays with records in arbitrary or user-controlled order, choose the
901 : * pivot element using a median-of-nine decision tree. This reduces the
902 : * probability of selecting a bad pivot value which causes worst case
903 : * behavior (i.e. partition sizes of 1).
904 : *
905 : * 3. The smaller of the two sub-partitions is pushed onto the stack to start
906 : * the next level of recursion, and the larger sub-partition replaces the
907 : * current stack frame. This guarantees that we won't need more than
908 : * log2(nr) stack space.
909 : *
910 : * 4. For small sets, load the records into the scratchpad and run heapsort on
911 : * them because that is very fast. In the author's experience, this yields
912 : * a ~10% reduction in runtime.
913 : *
914 : * If a small set is contained entirely within a single xfile memory page,
915 : * map the page directly and run heap sort directly on the xfile page
916 : * instead of using the load/store interface. This halves the runtime.
917 : *
918 : * 5. This optimization is specific to the implementation. When converging lo
919 : * and hi after selecting a pivot, we will try to retain the xfile memory
920 : * page between load calls, which reduces run time by 50%.
921 : */
922 :
923 : /*
924 : * Due to the use of signed indices, we can only support up to 2^63 records.
925 : * Files can only grow to 2^63 bytes, so this is not much of a limitation.
926 : */
927 : #define QSORT_MAX_RECS (1ULL << 63)
928 :
929 : int
930 461279 : xfarray_sort(
931 : struct xfarray *array,
932 : xfarray_cmp_fn cmp_fn,
933 : unsigned int flags)
934 : {
935 461279 : struct xfarray_sortinfo *si;
936 461279 : xfarray_idx_t *si_lo, *si_hi;
937 461279 : void *pivot;
938 461279 : void *scratch = xfarray_scratch(array);
939 461279 : xfarray_idx_t lo, hi;
940 461279 : int error = 0;
941 :
942 461279 : if (array->nr < 2)
943 : return 0;
944 341205 : if (array->nr >= QSORT_MAX_RECS)
945 : return -E2BIG;
946 :
947 341205 : error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
948 341207 : if (error)
949 : return error;
950 341207 : si_lo = xfarray_sortinfo_lo(si);
951 341207 : si_hi = xfarray_sortinfo_hi(si);
952 341207 : pivot = xfarray_sortinfo_pivot(si);
953 :
954 863846 : while (si->stack_depth >= 0) {
955 522639 : lo = si_lo[si->stack_depth];
956 522639 : hi = si_hi[si->stack_depth];
957 :
958 522639 : trace_xfarray_qsort(si, lo, hi);
959 :
960 : /* Nothing left in this partition to sort; pop stack. */
961 522639 : if (lo >= hi) {
962 0 : si->stack_depth--;
963 0 : continue;
964 : }
965 :
966 : /*
967 : * If directly mapping the page and sorting can solve our
968 : * problems, we're done.
969 : */
970 522639 : if (xfarray_want_pagesort(si, lo, hi)) {
971 424060 : error = xfarray_pagesort(si, lo, hi);
972 424060 : if (error)
973 0 : goto out_free;
974 424060 : si->stack_depth--;
975 424060 : continue;
976 : }
977 :
978 : /* If insertion sort can solve our problems, we're done. */
979 98579 : if (xfarray_want_isort(si, lo, hi)) {
980 7863 : error = xfarray_isort(si, lo, hi);
981 7863 : if (error)
982 0 : goto out_free;
983 7863 : si->stack_depth--;
984 7863 : continue;
985 : }
986 :
987 : /* Pick a pivot, move it to a[lo] and stash it. */
988 90716 : error = xfarray_qsort_pivot(si, lo, hi);
989 90716 : if (error)
990 0 : goto out_free;
991 :
992 : /*
993 : * Rearrange a[lo..hi] such that everything smaller than the
994 : * pivot is on the left side of the range and everything larger
995 : * than the pivot is on the right side of the range.
996 : */
997 28017995 : while (lo < hi) {
998 : /*
999 : * Decrement hi until it finds an a[hi] less than the
1000 : * pivot value.
1001 : */
1002 27927279 : error = xfarray_sort_load_cached(si, hi, scratch);
1003 27927279 : if (error)
1004 0 : goto out_free;
1005 76625247 : while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
1006 : lo < hi) {
1007 48697968 : hi--;
1008 48697968 : error = xfarray_sort_load_cached(si, hi,
1009 : scratch);
1010 48697968 : if (error)
1011 0 : goto out_free;
1012 : }
1013 27927279 : error = xfarray_sort_put_page(si);
1014 27927279 : if (error)
1015 0 : goto out_free;
1016 :
1017 27927279 : if (xfarray_sort_terminated(si, &error))
1018 0 : goto out_free;
1019 :
1020 : /* Copy that item (a[hi]) to a[lo]. */
1021 27927279 : if (lo < hi) {
1022 27898623 : error = xfarray_sort_store(si, lo++, scratch);
1023 27898623 : if (error)
1024 0 : goto out_free;
1025 : }
1026 :
1027 : /*
1028 : * Increment lo until it finds an a[lo] greater than
1029 : * the pivot value.
1030 : */
1031 27927279 : error = xfarray_sort_load_cached(si, lo, scratch);
1032 27927279 : if (error)
1033 0 : goto out_free;
1034 70074035 : while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
1035 : lo < hi) {
1036 42146756 : lo++;
1037 42146756 : error = xfarray_sort_load_cached(si, lo,
1038 : scratch);
1039 42146756 : if (error)
1040 0 : goto out_free;
1041 : }
1042 27927279 : error = xfarray_sort_put_page(si);
1043 27927279 : if (error)
1044 0 : goto out_free;
1045 :
1046 27927279 : if (xfarray_sort_terminated(si, &error))
1047 0 : goto out_free;
1048 :
1049 : /* Copy that item (a[lo]) to a[hi]. */
1050 27927279 : if (lo < hi) {
1051 27856133 : error = xfarray_sort_store(si, hi--, scratch);
1052 27856133 : if (error)
1053 0 : goto out_free;
1054 : }
1055 :
1056 27927279 : if (xfarray_sort_terminated(si, &error))
1057 0 : goto out_free;
1058 : }
1059 :
1060 : /*
1061 : * Put our pivot value in the correct place at a[lo]. All
1062 : * values between a[beg[i]] and a[lo - 1] should be less than
1063 : * the pivot; and all values between a[lo + 1] and a[end[i]-1]
1064 : * should be greater than the pivot.
1065 : */
1066 90716 : error = xfarray_sort_store(si, lo, pivot);
1067 90716 : if (error)
1068 0 : goto out_free;
1069 :
1070 : /* Set up the stack frame to process the two partitions. */
1071 90716 : error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
1072 90716 : if (error)
1073 0 : goto out_free;
1074 :
1075 90716 : if (xfarray_sort_terminated(si, &error))
1076 0 : goto out_free;
1077 : }
1078 :
1079 341207 : out_free:
1080 341207 : trace_xfarray_sort_stats(si, error);
1081 341207 : kvfree(si);
1082 341207 : return error;
1083 : }
1084 :
1085 : /* How many bytes is this array consuming? */
1086 : unsigned long long
1087 3774307328 : xfarray_bytes(
1088 : struct xfarray *array)
1089 : {
1090 3774307328 : return xfile_bytes(array->xfile);
1091 : }
1092 :
1093 : /* Empty the entire array. */
1094 : void
1095 89613157 : xfarray_truncate(
1096 : struct xfarray *array)
1097 : {
1098 89613157 : xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
1099 89614343 : array->nr = 0;
1100 89614343 : }
|