// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_mru_cache.h"

/*
 * The MRU Cache data structure consists of a data store, an array of lists and
 * a lock to protect its internal state. At initialisation time, the client
 * supplies an element lifetime in milliseconds and a group count, as well as a
 * function pointer to call when deleting elements. A data structure for
 * queueing up work in the form of timed callbacks is also included.
 *
 * The group count controls how many lists are created, and thereby how finely
 * the elements are grouped in time. When reaping occurs, all the elements in
 * all the lists whose time has expired are deleted.
 *
 * To give an example of how this works in practice, consider a client that
 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
 * five. Five internal lists will be created, each representing a two second
 * period in time. When the first element is added, time zero for the data
 * structure is initialised to the current time.
 *
 * All the elements added in the first two seconds are appended to the first
 * list. Elements added in the third and fourth seconds go into the second
 * list, and so on. If an element is accessed at any point, it is removed from
 * its list and inserted at the head of the current most-recently-used list.
 *
 * The reaper function will have nothing to do until at least twelve seconds
 * have elapsed since the first element was added. The reason for this is that
 * if it were called at t=11s, there could be elements in the first list that
 * have only been inactive for nine seconds, so it still does nothing. If it is
 * called anywhere between t=12 and t=14 seconds, it will delete all the
 * elements that remain in the first list. It's therefore possible for elements
 * to remain in the data store even after they've been inactive for up to
 * (t + t/g) seconds, where t is the inactive element lifetime and g is the
 * number of groups.
 *
 * The above example assumes that the reaper function gets called at least once
 * every (t/g) seconds. If it is called less frequently, unused elements will
 * accumulate in the reap list until the reaper function is eventually called.
 * The current implementation uses work queue callbacks to carefully time the
 * reaper function calls, so this should happen rarely, if at all.
 *
 * From a design perspective, the primary reason for the choice of a list array
 * representing discrete time intervals is that it's only practical to reap
 * expired elements in groups of some appreciable size. This automatically
 * introduces a granularity to element lifetimes, so there's no point storing an
 * individual timeout with each element that specifies a more precise reap time.
 * The bonus is a saving of sizeof(long) bytes of memory per element stored.
 *
 * The elements could have been stored in just one list, but an array of
 * counters or pointers would need to be maintained to allow them to be divided
 * up into discrete time groups. More critically, the process of touching or
 * removing an element would involve walking large portions of the entire list,
 * which would have a detrimental effect on performance. The additional memory
 * requirement for the array of list heads is minimal.
 *
 * When an element is touched or deleted, it needs to be removed from its
 * current list. Doubly linked lists are used to make the list maintenance
 * portion of these operations O(1). Since reaper timing can be imprecise,
 * inserts and lookups can occur when there are no free lists available. When
 * this happens, all the elements on the LRU list need to be migrated to the end
 * of the reap list. To keep the list maintenance portion of these operations
 * O(1) also, list tails need to be accessible without walking the entire list.
 * This is the reason why doubly linked list heads are used.
 */
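
/*
 * Worked numbers for the example above: with a lifetime of t = 10s and
 * g = 5 groups, each group spans t/g = 2s, so an untouched element can
 * survive for at most t + t/g = 10 + 10/5 = 12 seconds of inactivity.
 */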

/*
 * An MRU Cache is a dynamic data structure that stores its elements in a way
 * that allows efficient lookups, but also groups them into discrete time
 * intervals based on insertion time. This allows elements to be efficiently
 * and automatically reaped after a fixed period of inactivity.
 *
 * When a client data pointer is stored in the MRU Cache it needs to be added to
 * both the data store and to one of the lists. It must also be possible to
 * access each of these entries via the other, i.e. to:
 *
 *    a) Walk a list, removing the corresponding data store entry for each item.
 *    b) Look up a data store entry, then access its list entry directly.
 *
 * To achieve both of these goals, each entry must contain both a list entry and
 * a key, in addition to the user's data pointer. Note that it's not a good
 * idea to have the client embed one of these structures at the top of their own
 * data structure, because inserting the same item more than once would most
 * likely result in a loop in one of the lists. That's a sure-fire recipe for
 * an infinite loop in the code.
 */
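
/*
 * For reference, the entry that pairs the list entry with the key is declared
 * in xfs_mru_cache.h and is embedded by the client in its own item structure:
 *
 *	struct xfs_mru_cache_elem {
 *		struct list_head	list_node;
 *		unsigned long		key;
 *	};
 */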
struct xfs_mru_cache {
	struct radix_tree_root	store;     /* Core storage data structure.  */
	struct list_head	*lists;    /* Array of lists, one per grp.  */
	struct list_head	reap_list; /* Elements overdue for reaping. */
	spinlock_t		lock;      /* Lock to protect this struct.  */
	unsigned int		grp_count; /* Number of discrete groups.    */
	unsigned int		grp_time;  /* Time period spanned by grps.  */
	unsigned int		lru_grp;   /* Group containing time zero.   */
	unsigned long		time_zero; /* Time first element was added. */
	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
	struct delayed_work	work;      /* Workqueue data for reaping.   */
	unsigned int		queued;    /* Work has been queued.         */
	void			*data;     /* Client data passed to free_func. */
};

static struct workqueue_struct	*xfs_mru_reap_wq;

/*
 * When inserting, destroying or reaping, it's first necessary to update the
 * lists relative to a particular time. In the case of destroying, that time
 * will be well in the future to ensure that all items are moved to the reap
 * list. In all other cases though, the time will be the current time.
 *
 * This function enters a loop, moving the contents of the LRU list to the reap
 * list again and again until either a) the lists are all empty, or b) time zero
 * has been advanced sufficiently to be within the immediate element lifetime.
 *
 * Case a) above is detected by counting how many groups are migrated and
 * stopping when they've all been moved. Case b) is detected by monitoring the
 * time_zero field, which is updated as each group is migrated.
 *
 * The return value is the earliest time that more migration could be needed, or
 * zero if there's no need to schedule more work because the lists are empty.
 */
STATIC unsigned long
_xfs_mru_cache_migrate(
	struct xfs_mru_cache	*mru,
	unsigned long		now)
{
	unsigned int		grp;
	unsigned int		migrated = 0;
	struct list_head	*lru_list;

	/* Nothing to do if the data store is empty. */
	if (!mru->time_zero)
		return 0;

	/* While time zero is older than the time spanned by all the lists. */
	while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {

		/*
		 * If the LRU list isn't empty, migrate its elements to the
		 * tail of the reap list.
		 */
		lru_list = mru->lists + mru->lru_grp;
		if (!list_empty(lru_list))
			list_splice_init(lru_list, mru->reap_list.prev);

		/*
		 * Advance the LRU group number, freeing the old LRU list to
		 * become the new MRU list; advance time zero accordingly.
		 */
		mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
		mru->time_zero += mru->grp_time;

		/*
		 * If reaping is so far behind that all the elements on all
		 * the lists have been migrated to the reap list, it's now
		 * empty.
		 */
		if (++migrated == mru->grp_count) {
			mru->lru_grp = 0;
			mru->time_zero = 0;
			return 0;
		}
	}

	/* Find the first non-empty list from the LRU end. */
	for (grp = 0; grp < mru->grp_count; grp++) {

		/* Check the grp'th list from the LRU end. */
		lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
		if (!list_empty(lru_list))
			return mru->time_zero +
			       (mru->grp_count + grp) * mru->grp_time;
	}

	/* All the lists must be empty. */
	mru->lru_grp = 0;
	mru->time_zero = 0;
	return 0;
}
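
/*
 * Worked example for the return value above: if the first non-empty list is
 * the LRU list itself (grp = 0), the next reap is due at
 * time_zero + grp_count * grp_time. Since grp_count includes the extra list
 * added at creation time, the newest possible element in that group will by
 * then have been inactive for (grp_count - 1) * grp_time jiffies, i.e. the
 * full requested lifetime.
 */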

/*
 * When inserting or doing a lookup, an element needs to be inserted into the
 * MRU list. The lists must be migrated first to ensure that they're
 * up-to-date, otherwise the new element could be given a shorter lifetime in
 * the cache than it should.
 */
STATIC void
_xfs_mru_cache_list_insert(
	struct xfs_mru_cache		*mru,
	struct xfs_mru_cache_elem	*elem)
{
	unsigned int		grp = 0;
	unsigned long		now = jiffies;

	/*
	 * If the data store is empty, initialise time zero, leave grp set to
	 * zero and start the work queue timer if necessary. Otherwise, set
	 * grp to the number of group times that have elapsed since time zero.
	 */
	if (!_xfs_mru_cache_migrate(mru, now)) {
		mru->time_zero = now;
		if (!mru->queued) {
			mru->queued = 1;
			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
					   mru->grp_count * mru->grp_time);
		}
	} else {
		grp = (now - mru->time_zero) / mru->grp_time;
		grp = (mru->lru_grp + grp) % mru->grp_count;
	}

	/* Insert the element at the tail of the corresponding list. */
	list_add_tail(&elem->list_node, mru->lists + grp);
}
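
/*
 * Worked example for the group arithmetic above: with grp_time equal to two
 * seconds' worth of jiffies, an element inserted five seconds after time_zero
 * yields grp = 5s / 2s = 2, i.e. the third list counting from the current LRU
 * group, wrapping modulo grp_count.
 */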

/*
 * When destroying or reaping, all the elements that were migrated to the reap
 * list need to be deleted. For each element this involves removing it from the
 * data store, removing it from the reap list, calling the client's free
 * function and deleting the element from the element cache.
 *
 * We get called holding the mru->lock, which we drop and then reacquire.
 * Sparse needs special help with this to tell it we know what we are doing.
 */
STATIC void
_xfs_mru_cache_clear_reap_list(
	struct xfs_mru_cache	*mru)
		__releases(mru->lock) __acquires(mru->lock)
{
	struct xfs_mru_cache_elem *elem, *next;
	struct list_head	tmp;

	INIT_LIST_HEAD(&tmp);
	list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {

		/* Remove the element from the data store. */
		radix_tree_delete(&mru->store, elem->key);

		/*
		 * Move the element to a temporary list so it can be freed
		 * without holding the lock.
		 */
		list_move(&elem->list_node, &tmp);
	}
	spin_unlock(&mru->lock);

	list_for_each_entry_safe(elem, next, &tmp, list_node) {
		list_del_init(&elem->list_node);
		mru->free_func(mru->data, elem);
	}

	spin_lock(&mru->lock);
}

/*
 * We fire the reap timer every group expiry interval so we always have a
 * reaper ready to run. This makes shutdown and flushing of the reaper easy to
 * do. Hence we need to keep track of when the next reap must occur so we can
 * determine at each interval whether there is anything we need to do.
 */
STATIC void
_xfs_mru_cache_reap(
	struct work_struct	*work)
{
	struct xfs_mru_cache	*mru =
		container_of(work, struct xfs_mru_cache, work.work);
	unsigned long		now, next;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return;

	spin_lock(&mru->lock);
	next = _xfs_mru_cache_migrate(mru, jiffies);
	_xfs_mru_cache_clear_reap_list(mru);

	mru->queued = next;
	if (mru->queued > 0) {
		now = jiffies;
		if (next <= now)
			next = 0;
		else
			next -= now;
		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
	}

	spin_unlock(&mru->lock);
}
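
/*
 * Worked example of the rescheduling above: _xfs_mru_cache_migrate() returns
 * an absolute expiry time in jiffies, while queue_delayed_work() takes a
 * relative delay. If next is 50 jiffies in the future, the work is queued
 * with a delay of 50; if next has already passed, a delay of 0 reruns the
 * reaper as soon as possible.
 */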

int
xfs_mru_cache_init(void)
{
	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
			XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
	if (!xfs_mru_reap_wq)
		return -ENOMEM;
	return 0;
}

void
xfs_mru_cache_uninit(void)
{
	destroy_workqueue(xfs_mru_reap_wq);
}

/*
 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
 * with the address of the pointer, an opaque client data pointer, a lifetime
 * value in milliseconds, a group count and a free function to use when
 * deleting elements. This function returns 0 on success, -EINVAL if any
 * parameter is invalid, or -ENOMEM if an allocation fails.
 */
int
xfs_mru_cache_create(
	struct xfs_mru_cache	**mrup,
	void			*data,
	unsigned int		lifetime_ms,
	unsigned int		grp_count,
	xfs_mru_cache_free_func_t free_func)
{
	struct xfs_mru_cache	*mru = NULL;
	int			err = 0, grp;
	unsigned int		grp_time;

	if (mrup)
		*mrup = NULL;

	if (!mrup || !grp_count || !lifetime_ms || !free_func)
		return -EINVAL;

	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
		return -EINVAL;

	if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
		return -ENOMEM;

	/* An extra list is needed to avoid reaping up to a grp_time early. */
	mru->grp_count = grp_count + 1;
	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);

	if (!mru->lists) {
		err = -ENOMEM;
		goto exit;
	}

	for (grp = 0; grp < mru->grp_count; grp++)
		INIT_LIST_HEAD(mru->lists + grp);

	/*
	 * We use GFP_KERNEL radix tree preload and do inserts under a
	 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
	 */
	INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
	INIT_LIST_HEAD(&mru->reap_list);
	spin_lock_init(&mru->lock);
	INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);

	mru->grp_time = grp_time;
	mru->free_func = free_func;
	mru->data = data;
	*mrup = mru;

exit:
	if (err && mru && mru->lists)
		kmem_free(mru->lists);
	if (err && mru)
		kmem_free(mru);

	return err;
}
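
/*
 * A minimal usage sketch, assuming a hypothetical client type; the names
 * example_item and example_free are illustrative, not part of this file:
 *
 *	struct example_item {
 *		struct xfs_mru_cache_elem	mru;
 *		void				*priv;
 *	};
 *
 *	static void
 *	example_free(void *data, struct xfs_mru_cache_elem *elem)
 *	{
 *		kfree(container_of(elem, struct example_item, mru));
 *	}
 *
 *	error = xfs_mru_cache_create(&cache, NULL, 10000, 5, example_free);
 *
 * This matches the example at the top of this file: a ten second lifetime
 * split across five groups.
 */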

/*
 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
 * free functions as they're deleted. When this function returns, the caller is
 * guaranteed that all the free functions for all the elements have finished
 * executing and the reaper is not running.
 */
static void
xfs_mru_cache_flush(
	struct xfs_mru_cache	*mru)
{
	if (!mru || !mru->lists)
		return;

	spin_lock(&mru->lock);
	if (mru->queued) {
		spin_unlock(&mru->lock);
		cancel_delayed_work_sync(&mru->work);
		spin_lock(&mru->lock);
	}

	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
	_xfs_mru_cache_clear_reap_list(mru);

	spin_unlock(&mru->lock);
}

void
xfs_mru_cache_destroy(
	struct xfs_mru_cache	*mru)
{
	if (!mru || !mru->lists)
		return;

	xfs_mru_cache_flush(mru);

	kmem_free(mru->lists);
	kmem_free(mru);
}

/*
 * To insert an element, call xfs_mru_cache_insert() with the data store, the
 * element's key and the element itself. This function returns 0 on success,
 * -ENOMEM if memory for the new entry couldn't be allocated, or the error
 * from the radix tree insert (e.g. -EEXIST if the key is already present).
 */
int
xfs_mru_cache_insert(
	struct xfs_mru_cache	*mru,
	unsigned long		key,
	struct xfs_mru_cache_elem *elem)
{
	int			error;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return -EINVAL;

	if (radix_tree_preload(GFP_NOFS))
		return -ENOMEM;

	INIT_LIST_HEAD(&elem->list_node);
	elem->key = key;

	spin_lock(&mru->lock);
	error = radix_tree_insert(&mru->store, key, elem);
	radix_tree_preload_end();
	if (!error)
		_xfs_mru_cache_list_insert(mru, elem);
	spin_unlock(&mru->lock);

	return error;
}
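
/*
 * Insertion sketch reusing the hypothetical example_item type shown after
 * xfs_mru_cache_create() above:
 *
 *	item = kzalloc(sizeof(*item), GFP_NOFS);
 *	if (!item)
 *		return -ENOMEM;
 *	error = xfs_mru_cache_insert(cache, key, &item->mru);
 *	if (error)
 *		kfree(item);
 */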

/*
 * To remove an element without calling the free function, call
 * xfs_mru_cache_remove() with the data store and the element's key. On
 * success the removed element is returned, otherwise this function returns a
 * NULL pointer.
 */
struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	struct xfs_mru_cache_elem *elem;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return NULL;

	spin_lock(&mru->lock);
	elem = radix_tree_delete(&mru->store, key);
	if (elem)
		list_del(&elem->list_node);
	spin_unlock(&mru->lock);

	return elem;
}

/*
 * To remove an element and call the free function, call xfs_mru_cache_delete()
 * with the data store and the element's key.
 */
void
xfs_mru_cache_delete(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	struct xfs_mru_cache_elem *elem;

	elem = xfs_mru_cache_remove(mru, key);
	if (elem)
		mru->free_func(mru->data, elem);
}
488 : /*
489 : * To look up an element using its key, call xfs_mru_cache_lookup() with the
490 : * data store and the element's key. If found, the element will be moved to the
491 : * head of the MRU list to indicate that it's been touched.
492 : *
493 : * The internal data structures are protected by a spinlock that is STILL HELD
494 : * when this function returns. Call xfs_mru_cache_done() to release it. Note
495 : * that it is not safe to call any function that might sleep in the interim.
496 : *
497 : * The implementation could have used reference counting to avoid this
498 : * restriction, but since most clients simply want to get, set or test a member
499 : * of the returned data structure, the extra per-element memory isn't warranted.
500 : *
501 : * If the element isn't found, this function returns NULL and the spinlock is
502 : * released. xfs_mru_cache_done() should NOT be called when this occurs.
503 : *
504 : * Because sparse isn't smart enough to know about conditional lock return
505 : * status, we need to help it get it right by annotating the path that does
506 : * not release the lock.
507 : */
508 : struct xfs_mru_cache_elem *
509 11221 : xfs_mru_cache_lookup(
510 : struct xfs_mru_cache *mru,
511 : unsigned long key)
512 : {
513 11221 : struct xfs_mru_cache_elem *elem;
514 :
515 11221 : ASSERT(mru && mru->lists);
516 11221 : if (!mru || !mru->lists)
517 : return NULL;
518 :
519 11221 : spin_lock(&mru->lock);
520 11227 : elem = radix_tree_lookup(&mru->store, key);
521 11227 : if (elem) {
522 10425 : list_del(&elem->list_node);
523 10425 : _xfs_mru_cache_list_insert(mru, elem);
524 10425 : __release(mru_lock); /* help sparse not be stupid */
525 : } else
526 802 : spin_unlock(&mru->lock);
527 :
528 : return elem;
529 : }
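
/*
 * Lookup/done pairing sketch for a hypothetical caller, reusing the
 * example_item type from above:
 *
 *	elem = xfs_mru_cache_lookup(cache, key);
 *	if (elem) {
 *		item = container_of(elem, struct example_item, mru);
 *		... get, set or test a member of item; must not sleep ...
 *		xfs_mru_cache_done(cache);
 *	}
 */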

/*
 * To release the internal data structure spinlock after having performed an
 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
 * with the data store pointer.
 */
void
xfs_mru_cache_done(
	struct xfs_mru_cache	*mru)
		__releases(mru->lock)
{
	spin_unlock(&mru->lock);
}