Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "scrub/xfile.h"
11 : #include "scrub/xfarray.h"
12 : #include "scrub/scrub.h"
13 : #include "scrub/trace.h"
14 :
15 : /*
16 : * Large Arrays of Fixed-Size Records
17 : * ==================================
18 : *
19 : * This memory array uses an xfile (which itself is a memfd "file") to store
20 : * large numbers of fixed-size records in memory that can be paged out. This
21 : * puts less stress on the memory reclaim algorithms during an online repair
22 : * because we don't have to pin so much memory. However, array access is less
23 : * direct than would be in a regular memory array. Access to the array is
24 : * performed via indexed load and store methods, and an append method is
25 : * provided for convenience. Array elements can be unset, which sets them to
26 : * all zeroes. Unset entries are skipped during iteration, though direct loads
27 : * will return a zeroed buffer. Callers are responsible for concurrency
28 : * control.
29 : */
30 :
31 : /*
32 : * Pointer to scratch space. Because we can't access the xfile data directly,
33 : * we allocate a small amount of memory on the end of the xfarray structure to
34 : * buffer array items when we need space to store values temporarily.
35 : */
36 : static inline void *xfarray_scratch(struct xfarray *array)
37 : {
38 140804 : return (array + 1);
39 : }
40 :
41 : /* Compute array index given an xfile offset. */
42 : static xfarray_idx_t
43 : xfarray_idx(
44 : struct xfarray *array,
45 : loff_t pos)
46 : {
47 132691522 : if (array->obj_size_log >= 0)
48 4720500 : return (xfarray_idx_t)pos >> array->obj_size_log;
49 :
50 127971022 : return div_u64((xfarray_idx_t)pos, array->obj_size);
51 : }
52 :
53 : /* Compute xfile offset of array element. */
54 : static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
55 : {
56 17362763434 : if (array->obj_size_log >= 0)
57 16160058093 : return idx << array->obj_size_log;
58 :
59 1202705341 : return idx * array->obj_size;
60 : }
61 :
62 : /*
63 : * Initialize a big memory array. Array records cannot be larger than a
64 : * page, and the array cannot span more bytes than the page cache supports.
65 : * If @required_capacity is nonzero, the maximum array size will be set to this
66 : * quantity and the array creation will fail if the underlying storage cannot
67 : * support that many records.
68 : */
69 : int
70 128326075 : xfarray_create(
71 : const char *description,
72 : unsigned long long required_capacity,
73 : size_t obj_size,
74 : struct xfarray **arrayp)
75 : {
76 128326075 : struct xfarray *array;
77 128326075 : struct xfile *xfile;
78 128326075 : int error;
79 :
80 128326075 : ASSERT(obj_size < PAGE_SIZE);
81 :
82 128326075 : error = xfile_create(description, 0, &xfile);
83 128486392 : if (error)
84 : return error;
85 :
86 128484033 : error = -ENOMEM;
87 128484033 : array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
88 128474794 : if (!array)
89 0 : goto out_xfile;
90 :
91 128474794 : array->xfile = xfile;
92 128474794 : array->obj_size = obj_size;
93 :
94 256949588 : if (is_power_of_2(obj_size))
95 1007548 : array->obj_size_log = ilog2(obj_size);
96 : else
97 127971020 : array->obj_size_log = -1;
98 :
99 128474794 : array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
100 128474794 : trace_xfarray_create(array, required_capacity);
101 :
102 128488813 : if (required_capacity > 0) {
103 140494 : if (array->max_nr < required_capacity) {
104 0 : error = -ENOMEM;
105 0 : goto out_xfarray;
106 : }
107 140494 : array->max_nr = required_capacity;
108 : }
109 :
110 128488813 : *arrayp = array;
111 128488813 : return 0;
112 :
113 : out_xfarray:
114 0 : kfree(array);
115 0 : out_xfile:
116 0 : xfile_destroy(xfile);
117 0 : return error;
118 : }
119 :
120 : /* Destroy the array. */
121 : void
122 128516569 : xfarray_destroy(
123 : struct xfarray *array)
124 : {
125 128516569 : xfile_destroy(array->xfile);
126 128518745 : kfree(array);
127 128518813 : }
128 :
129 : /* Load an element from the array. */
130 : int
131 13927572190 : xfarray_load(
132 : struct xfarray *array,
133 : xfarray_idx_t idx,
134 : void *ptr)
135 : {
136 13927572190 : if (idx >= array->nr)
137 : return -ENODATA;
138 :
139 27853092176 : return xfile_obj_load(array->xfile, ptr, array->obj_size,
140 : xfarray_pos(array, idx));
141 : }
142 :
143 : /* Is this array element potentially unset? */
144 : static inline bool
145 0 : xfarray_is_unset(
146 : struct xfarray *array,
147 : loff_t pos)
148 : {
149 0 : void *temp = xfarray_scratch(array);
150 0 : int error;
151 :
152 0 : if (array->unset_slots == 0)
153 : return false;
154 :
155 0 : error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
156 0 : if (!error && xfarray_element_is_null(array, temp))
157 0 : return true;
158 :
159 : return false;
160 : }
161 :
162 : /*
163 : * Unset an array element. If @idx is the last element in the array, the
164 : * array will be truncated. Otherwise, the entry will be zeroed.
165 : */
166 : int
167 0 : xfarray_unset(
168 : struct xfarray *array,
169 : xfarray_idx_t idx)
170 : {
171 0 : void *temp = xfarray_scratch(array);
172 0 : loff_t pos = xfarray_pos(array, idx);
173 0 : int error;
174 :
175 0 : if (idx >= array->nr)
176 : return -ENODATA;
177 :
178 0 : if (idx == array->nr - 1) {
179 0 : array->nr--;
180 0 : return 0;
181 : }
182 :
183 0 : if (xfarray_is_unset(array, pos))
184 : return 0;
185 :
186 0 : memset(temp, 0, array->obj_size);
187 0 : error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
188 0 : if (error)
189 : return error;
190 :
191 0 : array->unset_slots++;
192 0 : return 0;
193 : }
194 :
195 : /*
196 : * Store an element in the array. The element must not be completely zeroed,
197 : * because those are considered unset sparse elements.
198 : */
199 : int
200 2815090090 : xfarray_store(
201 : struct xfarray *array,
202 : xfarray_idx_t idx,
203 : const void *ptr)
204 : {
205 2815090090 : int ret;
206 :
207 2815090090 : if (idx >= array->max_nr)
208 : return -EFBIG;
209 :
210 2815090090 : ASSERT(!xfarray_element_is_null(array, ptr));
211 :
212 5631335094 : ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
213 : xfarray_pos(array, idx));
214 2817296537 : if (ret)
215 : return ret;
216 :
217 2817296537 : array->nr = max(array->nr, idx + 1);
218 2817296537 : return 0;
219 : }
220 :
221 : /* Is this array element NULL? */
222 : bool
223 15045250900 : xfarray_element_is_null(
224 : struct xfarray *array,
225 : const void *ptr)
226 : {
227 15045250900 : return !memchr_inv(ptr, 0, array->obj_size);
228 : }
229 :
230 : /*
231 : * Store an element anywhere in the array that is unset. If there are no
232 : * unset slots, append the element to the array.
233 : */
234 : int
235 0 : xfarray_store_anywhere(
236 : struct xfarray *array,
237 : const void *ptr)
238 : {
239 0 : void *temp = xfarray_scratch(array);
240 0 : loff_t endpos = xfarray_pos(array, array->nr);
241 0 : loff_t pos;
242 0 : int error;
243 :
244 : /* Find an unset slot to put it in. */
245 0 : for (pos = 0;
246 0 : pos < endpos && array->unset_slots > 0;
247 0 : pos += array->obj_size) {
248 0 : error = xfile_obj_load(array->xfile, temp, array->obj_size,
249 : pos);
250 0 : if (error || !xfarray_element_is_null(array, temp))
251 0 : continue;
252 :
253 0 : error = xfile_obj_store(array->xfile, ptr, array->obj_size,
254 : pos);
255 0 : if (error)
256 : return error;
257 :
258 0 : array->unset_slots--;
259 0 : return 0;
260 : }
261 :
262 : /* No unset slots found; attach it on the end. */
263 0 : array->unset_slots = 0;
264 0 : return xfarray_append(array, ptr);
265 : }
266 :
267 : /* Return length of array. */
268 : uint64_t
269 149547885 : xfarray_length(
270 : struct xfarray *array)
271 : {
272 149547885 : return array->nr;
273 : }
274 :
275 : /*
276 : * Decide which array item we're going to read as part of an _iter_get.
277 : * @cur is the array index, and @pos is the file offset of that array index in
278 : * the backing xfile. Returns ENODATA if we reach the end of the records.
279 : *
280 : * Reading from a hole in a sparse xfile causes page instantiation, so for
281 : * iterating a (possibly sparse) array we need to figure out if the cursor is
282 : * pointing at a totally uninitialized hole and move the cursor up if
283 : * necessary.
284 : */
285 : static inline int
286 12230294161 : xfarray_find_data(
287 : struct xfarray *array,
288 : xfarray_idx_t *cur,
289 : loff_t *pos)
290 : {
291 12230294161 : unsigned int pgoff = offset_in_page(*pos);
292 12230294161 : loff_t end_pos = *pos + array->obj_size - 1;
293 12230294161 : loff_t new_pos;
294 :
295 : /*
296 : * If the current array record is not adjacent to a page boundary, we
297 : * are in the middle of the page. We do not need to move the cursor.
298 : */
299 12230294161 : if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
300 : return 0;
301 :
302 : /*
303 : * Call SEEK_DATA on the last byte in the record we're about to read.
304 : * If the record ends at (or crosses) the end of a page then we know
305 : * that the first byte of the record is backed by pages and don't need
306 : * to query it. If instead the record begins at the start of the page
307 : * then we know that querying the last byte is just as good as querying
308 : * the first byte, since records cannot be larger than a page.
309 : *
310 : * If the call returns the same file offset, we know this record is
311 : * backed by real pages. We do not need to move the cursor.
312 : */
313 5754290 : new_pos = xfile_seek_data(array->xfile, end_pos);
314 5645887 : if (new_pos == -ENXIO)
315 : return -ENODATA;
316 5645887 : if (new_pos < 0)
317 0 : return new_pos;
318 5645887 : if (new_pos == end_pos)
319 : return 0;
320 :
321 : /*
322 : * Otherwise, SEEK_DATA told us how far up to move the file pointer to
323 : * find more data. Move the array index to the first record past the
324 : * byte offset we were given.
325 : */
326 4216728 : new_pos = roundup_64(new_pos, array->obj_size);
327 4216728 : *cur = xfarray_idx(array, new_pos);
328 4216728 : *pos = xfarray_pos(array, *cur);
329 4216728 : return 0;
330 : }
331 :
332 : /*
333 : * Starting at *idx, fetch the next non-null array entry and advance the index
334 : * to set up the next _load_next call. Returns ENODATA if we reach the end of
335 : * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
336 : * call to this function.
337 : */
338 : int
339 615869198 : xfarray_load_next(
340 : struct xfarray *array,
341 : xfarray_idx_t *idx,
342 : void *rec)
343 : {
344 615869198 : xfarray_idx_t cur = *idx;
345 1231738396 : loff_t pos = xfarray_pos(array, cur);
346 12230534371 : int error;
347 :
348 12230534371 : do {
349 12230534371 : if (cur >= array->nr)
350 : return -ENODATA;
351 :
352 : /*
353 : * Ask the backing store for the location of next possible
354 : * written record, then retrieve that record.
355 : */
356 12230331246 : error = xfarray_find_data(array, &cur, &pos);
357 12230280914 : if (error)
358 0 : return error;
359 12230280914 : error = xfarray_load(array, cur, rec);
360 12230240876 : if (error)
361 0 : return error;
362 :
363 12230240876 : cur++;
364 12230240876 : pos += array->obj_size;
365 12230240876 : } while (xfarray_element_is_null(array, rec));
366 :
367 615667021 : *idx = cur;
368 615667021 : return 0;
369 : }
370 :
371 : /* Sorting functions */
372 :
373 : #ifdef DEBUG
374 : # define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0)
375 : # define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0)
376 : # define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0)
377 : # define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
378 : #else
379 : # define xfarray_sort_bump_loads(si)
380 : # define xfarray_sort_bump_stores(si)
381 : # define xfarray_sort_bump_compares(si)
382 : # define xfarray_sort_bump_heapsorts(si)
383 : #endif /* DEBUG */
384 :
385 : /* Load an array element for sorting. */
386 : static inline int
387 : xfarray_sort_load(
388 : struct xfarray_sortinfo *si,
389 : xfarray_idx_t idx,
390 : void *ptr)
391 : {
392 459 : xfarray_sort_bump_loads(si);
393 459 : return xfarray_load(si->array, idx, ptr);
394 : }
395 :
396 : /* Store an array element for sorting. */
397 : static inline int
398 : xfarray_sort_store(
399 : struct xfarray_sortinfo *si,
400 : xfarray_idx_t idx,
401 : void *ptr)
402 : {
403 36764 : xfarray_sort_bump_stores(si);
404 36764 : return xfarray_store(si->array, idx, ptr);
405 : }
406 :
407 : /* Compare an array element for sorting. */
408 : static inline int
409 : xfarray_sort_cmp(
410 : struct xfarray_sortinfo *si,
411 : const void *a,
412 : const void *b)
413 : {
414 91021 : xfarray_sort_bump_compares(si);
415 91021 : return si->cmp_fn(a, b);
416 : }
417 :
418 : /* Return a pointer to the low index stack for quicksort partitioning. */
419 : static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
420 : {
421 93121 : return (xfarray_idx_t *)(si + 1);
422 : }
423 :
424 : /* Return a pointer to the high index stack for quicksort partitioning. */
425 : static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
426 : {
427 186301 : return xfarray_sortinfo_lo(si) + si->max_stack_depth;
428 : }
429 :
430 : /* Size of each element in the quicksort pivot array. */
431 : static inline size_t
432 : xfarray_pivot_rec_sz(
433 : struct xfarray *array)
434 : {
435 93169 : return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
436 : }
437 :
438 : /* Allocate memory to handle the sort. */
439 : static inline int
440 93118 : xfarray_sortinfo_alloc(
441 : struct xfarray *array,
442 : xfarray_cmp_fn cmp_fn,
443 : unsigned int flags,
444 : struct xfarray_sortinfo **infop)
445 : {
446 93118 : struct xfarray_sortinfo *si;
447 93118 : size_t nr_bytes = sizeof(struct xfarray_sortinfo);
448 93118 : size_t pivot_rec_sz = xfarray_pivot_rec_sz(array);
449 93118 : int max_stack_depth;
450 :
451 : /*
452 : * The median-of-nine pivot algorithm doesn't work if a subset has
453 : * fewer than 9 items. Make sure the in-memory sort will always take
454 : * over for subsets where this wouldn't be the case.
455 : */
456 93118 : BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
457 :
458 : /*
459 : * Tail-call recursion during the partitioning phase means that
460 : * quicksort will never recurse more than log2(nr) times. We need one
461 : * extra level of stack to hold the initial parameters. In-memory
462 : * sort will always take care of the last few levels of recursion for
463 : * us, so we can reduce the stack depth by that much.
464 : */
465 186236 : max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
466 93118 : if (max_stack_depth < 1)
467 24489 : max_stack_depth = 1;
468 :
469 : /* Each level of quicksort uses a lo and a hi index */
470 93118 : nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
471 :
472 : /* Scratchpad for in-memory sort, or finding the pivot */
473 93118 : nr_bytes += max_t(size_t,
474 : (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
475 : XFARRAY_ISORT_NR * array->obj_size);
476 :
477 93118 : si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
478 93125 : if (!si)
479 : return -ENOMEM;
480 :
481 93125 : si->array = array;
482 93125 : si->cmp_fn = cmp_fn;
483 93125 : si->flags = flags;
484 93125 : si->max_stack_depth = max_stack_depth;
485 93125 : si->max_stack_used = 1;
486 :
487 93125 : xfarray_sortinfo_lo(si)[0] = 0;
488 93125 : xfarray_sortinfo_hi(si)[0] = array->nr - 1;
489 :
490 93125 : trace_xfarray_sort(si, nr_bytes);
491 93118 : *infop = si;
492 93118 : return 0;
493 : }
494 :
495 : /* Should this sort be terminated by a fatal signal? */
496 : static inline bool
497 91799 : xfarray_sort_terminated(
498 : struct xfarray_sortinfo *si,
499 : int *error)
500 : {
501 : /*
502 : * If preemption is disabled, we need to yield to the scheduler every
503 : * few seconds so that we don't run afoul of the soft lockup watchdog
504 : * or RCU stall detector.
505 : */
506 91799 : cond_resched();
507 :
508 183598 : if ((si->flags & XFARRAY_SORT_KILLABLE) &&
509 91799 : fatal_signal_pending(current)) {
510 0 : if (*error == 0)
511 0 : *error = -EINTR;
512 0 : return true;
513 : }
514 : return false;
515 : }
516 :
517 : /* Do we want an in-memory sort? */
518 : static inline bool
519 : xfarray_want_isort(
520 : struct xfarray_sortinfo *si,
521 : xfarray_idx_t start,
522 : xfarray_idx_t end)
523 : {
524 : /*
525 : * For array subsets that fit in the scratchpad, it's much faster to
526 : * use the kernel's heapsort than quicksort's stack machine.
527 : */
528 55 : return (end - start) < XFARRAY_ISORT_NR;
529 : }
530 :
531 : /* Return the scratch space within the sortinfo structure. */
532 : static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
533 : {
534 4 : return xfarray_sortinfo_hi(si) + si->max_stack_depth;
535 : }
536 :
537 : /*
538 : * Sort a small number of array records using scratchpad memory. The records
539 : * need not be contiguous in the xfile's memory pages.
540 : */
541 : STATIC int
542 4 : xfarray_isort(
543 : struct xfarray_sortinfo *si,
544 : xfarray_idx_t lo,
545 : xfarray_idx_t hi)
546 : {
547 4 : void *scratch = xfarray_sortinfo_isort_scratch(si);
548 4 : loff_t lo_pos = xfarray_pos(si->array, lo);
549 4 : loff_t len = xfarray_pos(si->array, hi - lo + 1);
550 4 : int error;
551 :
552 4 : trace_xfarray_isort(si, lo, hi);
553 :
554 4 : xfarray_sort_bump_loads(si);
555 4 : error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
556 4 : if (error)
557 : return error;
558 :
559 4 : xfarray_sort_bump_heapsorts(si);
560 4 : sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
561 :
562 4 : xfarray_sort_bump_stores(si);
563 4 : return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
564 : }
565 :
566 : /* Grab a page for sorting records. */
567 : static inline int
568 129902 : xfarray_sort_get_page(
569 : struct xfarray_sortinfo *si,
570 : loff_t pos,
571 : uint64_t len)
572 : {
573 129902 : int error;
574 :
575 129902 : error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
576 129910 : if (error)
577 : return error;
578 :
579 : /*
580 : * xfile pages must never be mapped into userspace, so we skip the
581 : * dcache flush when mapping the page.
582 : */
583 129910 : si->page_kaddr = kmap_local_page(si->xfpage.page);
584 129910 : return 0;
585 : }
586 :
587 : /* Release a page we grabbed for sorting records. */
588 : static inline int
589 129914 : xfarray_sort_put_page(
590 : struct xfarray_sortinfo *si)
591 : {
592 129914 : if (!si->page_kaddr)
593 : return 0;
594 :
595 129914 : kunmap_local(si->page_kaddr);
596 129914 : si->page_kaddr = NULL;
597 :
598 129914 : return xfile_put_page(si->array->xfile, &si->xfpage);
599 : }
600 :
601 : /* Decide if these records are eligible for in-page sorting. */
602 : static inline bool
603 93232 : xfarray_want_pagesort(
604 : struct xfarray_sortinfo *si,
605 : xfarray_idx_t lo,
606 : xfarray_idx_t hi)
607 : {
608 93232 : pgoff_t lo_page;
609 93232 : pgoff_t hi_page;
610 93232 : loff_t end_pos;
611 :
612 : /* We can only map one page at a time. */
613 93232 : lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
614 93232 : end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
615 93232 : hi_page = end_pos >> PAGE_SHIFT;
616 :
617 93232 : return lo_page == hi_page;
618 : }
619 :
620 : /* Sort a bunch of records that all live in the same memory page. */
621 : STATIC int
622 93190 : xfarray_pagesort(
623 : struct xfarray_sortinfo *si,
624 : xfarray_idx_t lo,
625 : xfarray_idx_t hi)
626 : {
627 93190 : void *startp;
628 93190 : loff_t lo_pos = xfarray_pos(si->array, lo);
629 93190 : uint64_t len = xfarray_pos(si->array, hi - lo);
630 93190 : int error = 0;
631 :
632 93190 : trace_xfarray_pagesort(si, lo, hi);
633 :
634 93189 : xfarray_sort_bump_loads(si);
635 93189 : error = xfarray_sort_get_page(si, lo_pos, len);
636 93196 : if (error)
637 : return error;
638 :
639 93196 : xfarray_sort_bump_heapsorts(si);
640 93196 : startp = si->page_kaddr + offset_in_page(lo_pos);
641 93196 : sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
642 :
643 93177 : xfarray_sort_bump_stores(si);
644 93177 : return xfarray_sort_put_page(si);
645 : }
646 :
647 : /* Return a pointer to the xfarray pivot record within the sortinfo struct. */
648 : static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
649 : {
650 93172 : return xfarray_sortinfo_hi(si) + si->max_stack_depth;
651 : }
652 :
653 : /* Return a pointer to the start of the pivot array. */
654 : static inline void *
655 : xfarray_sortinfo_pivot_array(
656 : struct xfarray_sortinfo *si)
657 : {
658 51 : return xfarray_sortinfo_pivot(si) + si->array->obj_size;
659 : }
660 :
661 : /* The xfarray record is stored at the start of each pivot array element. */
662 : static inline void *
663 : xfarray_pivot_array_rec(
664 : void *pa,
665 : size_t pa_recsz,
666 : unsigned int pa_idx)
667 : {
668 1399 : return pa + (pa_recsz * pa_idx);
669 : }
670 :
671 : /* The xfarray index is stored at the end of each pivot array element. */
672 : static inline xfarray_idx_t *
673 : xfarray_pivot_array_idx(
674 : void *pa,
675 : size_t pa_recsz,
676 : unsigned int pa_idx)
677 : {
678 1356 : return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
679 : sizeof(xfarray_idx_t);
680 : }
681 :
682 : /*
683 : * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
684 : * the cached pivot record for the next step.
685 : *
686 : * Load evenly-spaced records within the given range into memory, sort them,
687 : * and choose the pivot from the median record. Using multiple points will
688 : * improve the quality of the pivot selection, and hopefully avoid the worst
689 : * quicksort behavior, since our array values are nearly always evenly sorted.
690 : */
691 : STATIC int
692 51 : xfarray_qsort_pivot(
693 : struct xfarray_sortinfo *si,
694 : xfarray_idx_t lo,
695 : xfarray_idx_t hi)
696 : {
697 51 : void *pivot = xfarray_sortinfo_pivot(si);
698 51 : void *parray = xfarray_sortinfo_pivot_array(si);
699 51 : void *recp;
700 51 : xfarray_idx_t *idxp;
701 51 : xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
702 51 : size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
703 51 : int i, j;
704 51 : int error;
705 :
706 51 : ASSERT(step > 0);
707 :
708 : /*
709 : * Load the xfarray indexes of the records we intend to sample into the
710 : * pivot array.
711 : */
712 51 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
713 51 : *idxp = lo;
714 408 : for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
715 357 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
716 357 : *idxp = lo + (i * step);
717 : }
718 51 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
719 : XFARRAY_QSORT_PIVOT_NR - 1);
720 51 : *idxp = hi;
721 :
722 : /* Load the selected xfarray records into the pivot array. */
723 510 : for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
724 459 : xfarray_idx_t idx;
725 :
726 459 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
727 459 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
728 :
729 : /* No unset records; load directly into the array. */
730 459 : if (likely(si->array->unset_slots == 0)) {
731 459 : error = xfarray_sort_load(si, *idxp, recp);
732 459 : if (error)
733 0 : return error;
734 459 : continue;
735 : }
736 :
737 : /*
738 : * Load non-null records into the scratchpad without changing
739 : * the xfarray_idx_t in the pivot array.
740 : */
741 0 : idx = *idxp;
742 0 : xfarray_sort_bump_loads(si);
743 0 : error = xfarray_load_next(si->array, &idx, recp);
744 0 : if (error)
745 0 : return error;
746 : }
747 :
748 51 : xfarray_sort_bump_heapsorts(si);
749 51 : sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
750 :
751 : /*
752 : * We sorted the pivot array records (which includes the xfarray
753 : * indices) in xfarray record order. The median element of the pivot
754 : * array contains the xfarray record that we will use as the pivot.
755 : * Copy that xfarray record to the designated space.
756 : */
757 51 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
758 : XFARRAY_QSORT_PIVOT_NR / 2);
759 102 : memcpy(pivot, recp, si->array->obj_size);
760 :
761 : /* If the pivot record we chose was already in a[lo] then we're done. */
762 51 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
763 : XFARRAY_QSORT_PIVOT_NR / 2);
764 51 : if (*idxp == lo)
765 : return 0;
766 :
767 : /*
768 : * Find the cached copy of a[lo] in the pivot array so that we can swap
769 : * a[lo] and a[pivot].
770 : */
771 430 : for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
772 387 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
773 387 : if (*idxp == lo)
774 43 : j = i;
775 : }
776 43 : if (j < 0) {
777 0 : ASSERT(j >= 0);
778 0 : return -EFSCORRUPTED;
779 : }
780 :
781 : /* Swap a[lo] and a[pivot]. */
782 43 : error = xfarray_sort_store(si, lo, pivot);
783 43 : if (error)
784 : return error;
785 :
786 43 : recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
787 43 : idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
788 : XFARRAY_QSORT_PIVOT_NR / 2);
789 43 : return xfarray_sort_store(si, *idxp, recp);
790 : }
791 :
792 : /*
793 : * Set up the pointers for the next iteration. We push onto the stack all of
794 : * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
795 : * current stack frame to point to the unsorted values between a[beg[i]] and
796 : * a[lo] so that those values will be sorted when we pop the stack.
797 : */
798 : static inline int
799 51 : xfarray_qsort_push(
800 : struct xfarray_sortinfo *si,
801 : xfarray_idx_t *si_lo,
802 : xfarray_idx_t *si_hi,
803 : xfarray_idx_t lo,
804 : xfarray_idx_t hi)
805 : {
806 : /* Check for stack overflows */
807 51 : if (si->stack_depth >= si->max_stack_depth - 1) {
808 0 : ASSERT(si->stack_depth < si->max_stack_depth - 1);
809 0 : return -EFSCORRUPTED;
810 : }
811 :
812 51 : si->max_stack_used = max_t(uint8_t, si->max_stack_used,
813 : si->stack_depth + 2);
814 :
815 51 : si_lo[si->stack_depth + 1] = lo + 1;
816 51 : si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
817 51 : si_hi[si->stack_depth++] = lo - 1;
818 :
819 : /*
820 : * Always start with the smaller of the two partitions to keep the
821 : * amount of recursion in check.
822 : */
823 51 : if (si_hi[si->stack_depth] - si_lo[si->stack_depth] >
824 51 : si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
825 22 : swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
826 22 : swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
827 : }
828 :
829 : return 0;
830 : }
831 :
832 : /*
833 : * Load an element from the array into the first scratchpad and cache the page,
834 : * if possible.
835 : */
836 : static inline int
837 91021 : xfarray_sort_load_cached(
838 : struct xfarray_sortinfo *si,
839 : xfarray_idx_t idx,
840 : void *ptr)
841 : {
842 91021 : loff_t idx_pos = xfarray_pos(si->array, idx);
843 91021 : pgoff_t startpage;
844 91021 : pgoff_t endpage;
845 91021 : int error = 0;
846 :
847 : /*
848 : * If this load would split a page, release the cached page, if any,
849 : * and perform a traditional read.
850 : */
851 91021 : startpage = idx_pos >> PAGE_SHIFT;
852 91021 : endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
853 91021 : if (startpage != endpage) {
854 0 : error = xfarray_sort_put_page(si);
855 0 : if (error)
856 : return error;
857 :
858 0 : if (xfarray_sort_terminated(si, &error))
859 0 : return error;
860 :
861 0 : return xfile_obj_load(si->array->xfile, ptr,
862 : si->array->obj_size, idx_pos);
863 : }
864 :
865 : /* If the cached page is not the one we want, release it. */
866 91021 : if (xfile_page_cached(&si->xfpage) &&
867 : xfile_page_index(&si->xfpage) != startpage) {
868 33 : error = xfarray_sort_put_page(si);
869 33 : if (error)
870 : return error;
871 : }
872 :
873 : /*
874 : * If we don't have a cached page (and we know the load is contained
875 : * in a single page) then grab it.
876 : */
877 91021 : if (!xfile_page_cached(&si->xfpage)) {
878 36719 : if (xfarray_sort_terminated(si, &error))
879 0 : return error;
880 :
881 36719 : error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
882 : PAGE_SIZE);
883 36719 : if (error)
884 : return error;
885 : }
886 :
887 182042 : memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
888 : si->array->obj_size);
889 91021 : return 0;
890 : }
891 :
892 : /*
893 : * Sort the array elements via quicksort. This implementation incorporates
894 : * four optimizations discussed in Sedgewick:
895 : *
896 : * 1. Use an explicit stack of array indices to store the next array partition
897 : * to sort. This helps us to avoid recursion in the call stack, which is
898 : * particularly expensive in the kernel.
899 : *
900 : * 2. For arrays with records in arbitrary or user-controlled order, choose the
901 : * pivot element using a median-of-nine decision tree. This reduces the
902 : * probability of selecting a bad pivot value which causes worst case
903 : * behavior (i.e. partition sizes of 1).
904 : *
905 : * 3. The smaller of the two sub-partitions is pushed onto the stack to start
906 : * the next level of recursion, and the larger sub-partition replaces the
907 : * current stack frame. This guarantees that we won't need more than
908 : * log2(nr) stack space.
909 : *
910 : * 4. For small sets, load the records into the scratchpad and run heapsort on
911 : * them because that is very fast. In the author's experience, this yields
912 : * a ~10% reduction in runtime.
913 : *
914 : * If a small set is contained entirely within a single xfile memory page,
915 : * map the page directly and run heap sort directly on the xfile page
916 : * instead of using the load/store interface. This halves the runtime.
917 : *
918 : * 5. This optimization is specific to the implementation. When converging lo
919 : * and hi after selecting a pivot, we will try to retain the xfile memory
920 : * page between load calls, which reduces run time by 50%.
921 : */
922 :
923 : /*
924 : * Due to the use of signed indices, we can only support up to 2^63 records.
925 : * Files can only grow to 2^63 bytes, so this is not much of a limitation.
926 : */
927 : #define QSORT_MAX_RECS (1ULL << 63)
928 :
929 : int
930 140804 : xfarray_sort(
931 : struct xfarray *array,
932 : xfarray_cmp_fn cmp_fn,
933 : unsigned int flags)
934 : {
935 140804 : struct xfarray_sortinfo *si;
936 140804 : xfarray_idx_t *si_lo, *si_hi;
937 140804 : void *pivot;
938 140804 : void *scratch = xfarray_scratch(array);
939 140804 : xfarray_idx_t lo, hi;
940 140804 : int error = 0;
941 :
942 140804 : if (array->nr < 2)
943 : return 0;
944 93114 : if (array->nr >= QSORT_MAX_RECS)
945 : return -E2BIG;
946 :
947 93114 : error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
948 93121 : if (error)
949 : return error;
950 93121 : si_lo = xfarray_sortinfo_lo(si);
951 93121 : si_hi = xfarray_sortinfo_hi(si);
952 93121 : pivot = xfarray_sortinfo_pivot(si);
953 :
954 186360 : while (si->stack_depth >= 0) {
955 93221 : lo = si_lo[si->stack_depth];
956 93221 : hi = si_hi[si->stack_depth];
957 :
958 93221 : trace_xfarray_qsort(si, lo, hi);
959 :
960 : /* Nothing left in this partition to sort; pop stack. */
961 93223 : if (lo >= hi) {
962 0 : si->stack_depth--;
963 0 : continue;
964 : }
965 :
966 : /*
967 : * If directly mapping the page and sorting can solve our
968 : * problems, we're done.
969 : */
970 93223 : if (xfarray_want_pagesort(si, lo, hi)) {
971 93168 : error = xfarray_pagesort(si, lo, hi);
972 93184 : if (error)
973 0 : goto out_free;
974 93184 : si->stack_depth--;
975 93184 : continue;
976 : }
977 :
978 : /* If insertion sort can solve our problems, we're done. */
979 55 : if (xfarray_want_isort(si, lo, hi)) {
980 4 : error = xfarray_isort(si, lo, hi);
981 4 : if (error)
982 0 : goto out_free;
983 4 : si->stack_depth--;
984 4 : continue;
985 : }
986 :
987 : /* Pick a pivot, move it to a[lo] and stash it. */
988 51 : error = xfarray_qsort_pivot(si, lo, hi);
989 51 : if (error)
990 0 : goto out_free;
991 :
992 : /*
993 : * Rearrange a[lo..hi] such that everything smaller than the
994 : * pivot is on the left side of the range and everything larger
995 : * than the pivot is on the right side of the range.
996 : */
997 18394 : while (lo < hi) {
998 : /*
999 : * Decrement hi until it finds an a[hi] less than the
1000 : * pivot value.
1001 : */
1002 18343 : error = xfarray_sort_load_cached(si, hi, scratch);
1003 18343 : if (error)
1004 0 : goto out_free;
1005 45809 : while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
1006 : lo < hi) {
1007 27466 : hi--;
1008 27466 : error = xfarray_sort_load_cached(si, hi,
1009 : scratch);
1010 27466 : if (error)
1011 0 : goto out_free;
1012 : }
1013 18343 : error = xfarray_sort_put_page(si);
1014 18343 : if (error)
1015 0 : goto out_free;
1016 :
1017 18343 : if (xfarray_sort_terminated(si, &error))
1018 0 : goto out_free;
1019 :
1020 : /* Copy that item (a[hi]) to a[lo]. */
1021 18343 : if (lo < hi) {
1022 18326 : error = xfarray_sort_store(si, lo++, scratch);
1023 18326 : if (error)
1024 0 : goto out_free;
1025 : }
1026 :
1027 : /*
1028 : * Increment lo until it finds an a[lo] greater than
1029 : * the pivot value.
1030 : */
1031 18343 : error = xfarray_sort_load_cached(si, lo, scratch);
1032 18343 : if (error)
1033 0 : goto out_free;
1034 45212 : while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
1035 : lo < hi) {
1036 26869 : lo++;
1037 26869 : error = xfarray_sort_load_cached(si, lo,
1038 : scratch);
1039 26869 : if (error)
1040 0 : goto out_free;
1041 : }
1042 18343 : error = xfarray_sort_put_page(si);
1043 18343 : if (error)
1044 0 : goto out_free;
1045 :
1046 18343 : if (xfarray_sort_terminated(si, &error))
1047 0 : goto out_free;
1048 :
1049 : /* Copy that item (a[lo]) to a[hi]. */
1050 18343 : if (lo < hi) {
1051 18301 : error = xfarray_sort_store(si, hi--, scratch);
1052 18301 : if (error)
1053 0 : goto out_free;
1054 : }
1055 :
1056 18343 : if (xfarray_sort_terminated(si, &error))
1057 0 : goto out_free;
1058 : }
1059 :
1060 : /*
1061 : * Put our pivot value in the correct place at a[lo]. All
1062 : * values between a[beg[i]] and a[lo - 1] should be less than
1063 : * the pivot; and all values between a[lo + 1] and a[end[i]-1]
1064 : * should be greater than the pivot.
1065 : */
1066 51 : error = xfarray_sort_store(si, lo, pivot);
1067 51 : if (error)
1068 0 : goto out_free;
1069 :
1070 : /* Set up the stack frame to process the two partitions. */
1071 51 : error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
1072 51 : if (error)
1073 0 : goto out_free;
1074 :
1075 51 : if (xfarray_sort_terminated(si, &error))
1076 0 : goto out_free;
1077 : }
1078 :
1079 93139 : out_free:
1080 93139 : trace_xfarray_sort_stats(si, error);
1081 93138 : kvfree(si);
1082 93138 : return error;
1083 : }
1084 :
1085 : /* How many bytes is this array consuming? */
1086 : unsigned long long
1087 2115596972 : xfarray_bytes(
1088 : struct xfarray *array)
1089 : {
1090 2115596972 : return xfile_bytes(array->xfile);
1091 : }
1092 :
1093 : /* Empty the entire array. */
1094 : void
1095 113831463 : xfarray_truncate(
1096 : struct xfarray *array)
1097 : {
1098 113831463 : xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
1099 113829470 : array->nr = 0;
1100 113829470 : }
|