Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include <linux/iversion.h>
7 :
8 : #include "xfs.h"
9 : #include "xfs_fs.h"
10 : #include "xfs_shared.h"
11 : #include "xfs_format.h"
12 : #include "xfs_log_format.h"
13 : #include "xfs_trans_resv.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_dir2.h"
18 : #include "xfs_attr.h"
19 : #include "xfs_trans_space.h"
20 : #include "xfs_trans.h"
21 : #include "xfs_buf_item.h"
22 : #include "xfs_inode_item.h"
23 : #include "xfs_iunlink_item.h"
24 : #include "xfs_ialloc.h"
25 : #include "xfs_bmap.h"
26 : #include "xfs_bmap_util.h"
27 : #include "xfs_errortag.h"
28 : #include "xfs_error.h"
29 : #include "xfs_quota.h"
30 : #include "xfs_filestream.h"
31 : #include "xfs_trace.h"
32 : #include "xfs_icache.h"
33 : #include "xfs_symlink.h"
34 : #include "xfs_trans_priv.h"
35 : #include "xfs_log.h"
36 : #include "xfs_bmap_btree.h"
37 : #include "xfs_reflink.h"
38 : #include "xfs_ag.h"
39 : #include "xfs_log_priv.h"
40 :
41 : struct kmem_cache *xfs_inode_cache;
42 :
43 : /*
44 : * Used in xfs_itruncate_extents(). This is the maximum number of extents
45 : * freed from a file in a single transaction.
46 : */
47 : #define XFS_ITRUNC_MAX_EXTENTS 2
48 :
49 : STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
50 : STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
51 : struct xfs_inode *);
52 :
53 : /*
54 : * Helper function to extract the extent size hint from an inode.
55 : */
56 : xfs_extlen_t
57 216892562 : xfs_get_extsz_hint(
58 : struct xfs_inode *ip)
59 : {
60 : /*
61 : * No point in aligning allocations if we need to COW to actually
62 : * write to them.
63 : */
64 216892562 : if (xfs_is_always_cow_inode(ip))
65 : return 0;
66 216899051 : if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
67 : return ip->i_extsize;
68 204275547 : if (XFS_IS_REALTIME_INODE(ip))
69 97551373 : return ip->i_mount->m_sb.sb_rextsize;
70 : return 0;
71 : }
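
/*
 * Hedged usage sketch (not part of this file; the helper name is
 * hypothetical): a caller can round an allocation request up to the hint
 * returned above so allocations stay aligned. XFS itself performs this
 * alignment inside its allocator paths.
 */
static xfs_extlen_t
example_align_to_extsz_hint(
	struct xfs_inode	*ip,
	xfs_extlen_t		len)
{
	xfs_extlen_t		extsz = xfs_get_extsz_hint(ip);

	if (extsz)
		len = roundup(len, extsz);	/* round up to a hint multiple */
	return len;
}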
72 :
73 : /*
74 : * Helper function to extract CoW extent size hint from inode.
75 : * Between the extent size hint and the CoW extent size hint, we
76 : * return the greater of the two. If the value is zero (automatic),
77 : * use the default size.
78 : */
79 : xfs_extlen_t
80 692738 : xfs_get_cowextsz_hint(
81 : struct xfs_inode *ip)
82 : {
83 692738 : xfs_extlen_t a, b;
84 :
85 692738 : a = 0;
86 692738 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
87 28103 : a = ip->i_cowextsize;
88 692738 : b = xfs_get_extsz_hint(ip);
89 :
90 692738 : a = max(a, b);
91 692738 : if (a == 0)
92 664631 : return XFS_DEFAULT_COWEXTSZ_HINT;
93 : return a;
94 : }
95 :
96 : /*
97 : * These two are wrapper routines around the xfs_ilock() routine used to
98 : * centralize some grungy code. They are used in places that wish to lock the
99 : * inode solely for reading the extents. The reason these places can't just
100 : * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
101 : * bringing in of the extents from disk for a file in b-tree format. If the
102 : * inode is in b-tree format, then we need to lock the inode exclusively until
103 : * the extents are read in. Locking it exclusively all the time would limit
104 : * our parallelism unnecessarily, though. What we do instead is check to see
105 : * if the extents have been read in yet, and only lock the inode exclusively
106 : * if they have not.
107 : *
108 : * The functions return a value which should be given to the corresponding
109 : * xfs_iunlock() call.
110 : */
111 : uint
112 476496066 : xfs_ilock_data_map_shared(
113 : struct xfs_inode *ip)
114 : {
115 476496066 : uint lock_mode = XFS_ILOCK_SHARED;
116 :
117 476496066 : if (xfs_need_iread_extents(&ip->i_df))
118 41174 : lock_mode = XFS_ILOCK_EXCL;
119 476566545 : xfs_ilock(ip, lock_mode);
120 476562250 : return lock_mode;
121 : }
122 :
123 : uint
124 86304572 : xfs_ilock_attr_map_shared(
125 : struct xfs_inode *ip)
126 : {
127 86304572 : uint lock_mode = XFS_ILOCK_SHARED;
128 :
129 166323391 : if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
130 0 : lock_mode = XFS_ILOCK_EXCL;
131 86308737 : xfs_ilock(ip, lock_mode);
132 86314018 : return lock_mode;
133 : }
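
/*
 * Hedged usage sketch (hypothetical caller): whichever lock mode the
 * wrappers above decided on must be handed back unchanged to xfs_iunlock(),
 * since the caller cannot know in advance whether the shared or exclusive
 * ilock was taken.
 */
static void
example_walk_data_extents(
	struct xfs_inode	*ip)
{
	uint			lock_mode;

	lock_mode = xfs_ilock_data_map_shared(ip);
	/* ... read the data fork extent list under the lock ... */
	xfs_iunlock(ip, lock_mode);
}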
134 :
135 : /*
136 : * You can't set both SHARED and EXCL for the same lock,
137 : * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
138 : * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
139 : * to set in lock_flags.
140 : */
141 : static inline void
142 >10670*10^7 : xfs_lock_flags_assert(
143 : uint lock_flags)
144 : {
145 >10670*10^7 : ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
146 : (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
147 >10670*10^7 : ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
148 : (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
149 >10670*10^7 : ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
150 : (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
151 >10670*10^7 : ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
152 >10670*10^7 : ASSERT(lock_flags != 0);
153 >10670*10^7 : }
154 :
155 : /*
156 : * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
157 : * multi-reader locks: invalidate_lock and the i_lock. This routine allows
158 : * various combinations of the locks to be obtained.
159 : *
160 : * The 3 locks should always be ordered so that the IO lock is obtained first,
161 : * the mmap lock second and the ilock last in order to prevent deadlock.
162 : *
163 : * Basic locking order:
164 : *
165 : * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
166 : *
167 : * mmap_lock locking order:
168 : *
169 : * i_rwsem -> page lock -> mmap_lock
170 : * mmap_lock -> invalidate_lock -> page_lock
171 : *
172 : * The difference in mmap_lock locking order means that we cannot hold the
173 : * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
174 : * can fault in pages during copy in/out (for buffered IO) or require the
175 : * mmap_lock in get_user_pages() to map the user pages into the kernel address
176 : * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
177 : * fault because page faults already hold the mmap_lock.
178 : *
179 : * Hence to serialise fully against both syscall and mmap based IO, we need to
180 : * take both the i_rwsem and the invalidate_lock. These locks should *only* be
181 : * both taken in places where we need to invalidate the page cache in a race
182 : * free manner (e.g. truncate, hole punch and other extent manipulation
183 : * functions).
184 : */
185 : void
186 53198395104 : xfs_ilock(
187 : xfs_inode_t *ip,
188 : uint lock_flags)
189 : {
190 53198395104 : trace_xfs_ilock(ip, lock_flags, _RET_IP_);
191 :
192 53313943352 : xfs_lock_flags_assert(lock_flags);
193 :
194 53402572637 : if (lock_flags & XFS_IOLOCK_EXCL) {
195 1920307115 : down_write_nested(&VFS_I(ip)->i_rwsem,
196 : XFS_IOLOCK_DEP(lock_flags));
197 51482265522 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
198 322524783 : down_read_nested(&VFS_I(ip)->i_rwsem,
199 : XFS_IOLOCK_DEP(lock_flags));
200 : }
201 :
202 53403135911 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
203 187103263 : down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
204 : XFS_MMAPLOCK_DEP(lock_flags));
205 53216032648 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
206 3446597 : down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
207 : XFS_MMAPLOCK_DEP(lock_flags));
208 : }
209 :
210 53403130753 : if (lock_flags & XFS_ILOCK_EXCL)
211 3052691477 : mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
212 50350439276 : else if (lock_flags & XFS_ILOCK_SHARED)
213 48003317327 : mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
214 53362014399 : }
215 :
216 : /*
217 : * This is just like xfs_ilock(), except that the caller
218 : * is guaranteed not to sleep. It returns 1 if it gets
219 : * the requested locks and 0 otherwise. If the IO lock is
220 : * obtained but the inode lock cannot be, then the IO lock
221 : * is dropped before returning.
222 : *
223 : * ip -- the inode being locked
224 : * lock_flags -- this parameter indicates the inode's locks to
225 : * be locked. See the comment for xfs_ilock() for a list
226 : * of valid values.
227 : */
228 : int
229 1975917194 : xfs_ilock_nowait(
230 : xfs_inode_t *ip,
231 : uint lock_flags)
232 : {
233 1975917194 : trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
234 :
235 1976099300 : xfs_lock_flags_assert(lock_flags);
236 :
237 1978272121 : if (lock_flags & XFS_IOLOCK_EXCL) {
238 318455697 : if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
239 584372 : goto out;
240 1659816424 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
241 0 : if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
242 0 : goto out;
243 : }
244 :
245 1977687615 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
246 3880 : if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
247 0 : goto out_undo_iolock;
248 1977683735 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
249 0 : if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
250 0 : goto out_undo_iolock;
251 : }
252 :
253 1977687615 : if (lock_flags & XFS_ILOCK_EXCL) {
254 1087872649 : if (!mrtryupdate(&ip->i_lock))
255 12460 : goto out_undo_mmaplock;
256 889814966 : } else if (lock_flags & XFS_ILOCK_SHARED) {
257 572006565 : if (!mrtryaccess(&ip->i_lock))
258 19820563 : goto out_undo_mmaplock;
259 : }
260 : return 1;
261 :
262 19833023 : out_undo_mmaplock:
263 19833023 : if (lock_flags & XFS_MMAPLOCK_EXCL)
264 0 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
265 19833023 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
266 0 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
267 19833023 : out_undo_iolock:
268 19833023 : if (lock_flags & XFS_IOLOCK_EXCL)
269 0 : up_write(&VFS_I(ip)->i_rwsem);
270 19833023 : else if (lock_flags & XFS_IOLOCK_SHARED)
271 0 : up_read(&VFS_I(ip)->i_rwsem);
272 19833023 : out:
273 : return 0;
274 : }
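
/*
 * Hedged usage sketch (hypothetical caller): on failure nothing is held, so
 * a non-blocking path can simply back off and retry later.
 */
static int
example_trylock_both(
	struct xfs_inode	*ip)
{
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL))
		return -EAGAIN;	/* no locks held; caller retries */
	/* ... both locks held here ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	return 0;
}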
275 :
276 : /*
277 : * xfs_iunlock() is used to drop the inode locks acquired with
278 : * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
279 : * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
280 : * that we know which locks to drop.
281 : *
282 : * ip -- the inode being unlocked
283 : * lock_flags -- this parameter indicates the inode's locks to
284 : * be unlocked. See the comment for xfs_ilock() for a list
285 : * of valid values for this parameter.
286 : *
287 : */
288 : void
289 53049837964 : xfs_iunlock(
290 : xfs_inode_t *ip,
291 : uint lock_flags)
292 : {
293 53049837964 : xfs_lock_flags_assert(lock_flags);
294 :
295 53389693657 : if (lock_flags & XFS_IOLOCK_EXCL)
296 2238405702 : up_write(&VFS_I(ip)->i_rwsem);
297 51151287955 : else if (lock_flags & XFS_IOLOCK_SHARED)
298 322626756 : up_read(&VFS_I(ip)->i_rwsem);
299 :
300 53389521119 : if (lock_flags & XFS_MMAPLOCK_EXCL)
301 187103641 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
302 53202417478 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
303 3446573 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
304 :
305 53389523455 : if (lock_flags & XFS_ILOCK_EXCL)
306 4139158051 : mrunlock_excl(&ip->i_lock);
307 49250365404 : else if (lock_flags & XFS_ILOCK_SHARED)
308 48541969508 : mrunlock_shared(&ip->i_lock);
309 :
310 53463631196 : trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
311 53401324728 : }
312 :
313 : /*
314 : * Give up write locks. The I/O lock cannot be held nested
315 : * if it is being demoted.
316 : */
317 : void
318 104697 : xfs_ilock_demote(
319 : xfs_inode_t *ip,
320 : uint lock_flags)
321 : {
322 104697 : ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
323 104697 : ASSERT((lock_flags &
324 : ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
325 :
326 104697 : if (lock_flags & XFS_ILOCK_EXCL)
327 0 : mrdemote(&ip->i_lock);
328 104697 : if (lock_flags & XFS_MMAPLOCK_EXCL)
329 0 : downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
330 104697 : if (lock_flags & XFS_IOLOCK_EXCL)
331 104697 : downgrade_write(&VFS_I(ip)->i_rwsem);
332 :
333 104697 : trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
334 104697 : }
335 :
336 : #if defined(DEBUG) || defined(XFS_WARN)
337 : static inline bool
338 : __xfs_rwsem_islocked(
339 : struct rw_semaphore *rwsem,
340 : bool shared)
341 : {
342 1258227485 : if (!debug_locks)
343 0 : return rwsem_is_locked(rwsem);
344 :
345 : if (!shared)
346 : return lockdep_is_held_type(rwsem, 0);
347 :
348 : /*
349 : * We are checking that the lock is held at least in shared
350 : * mode but don't care that it might be held exclusively
351 : * (i.e. shared | excl). Hence we check if the lock is held
352 : * in any mode rather than an explicit shared mode.
353 : */
354 : return lockdep_is_held_type(rwsem, -1);
355 : }
356 :
357 : bool
358 10437626196 : xfs_isilocked(
359 : struct xfs_inode *ip,
360 : uint lock_flags)
361 : {
362 10437626196 : if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
363 9186702496 : if (!(lock_flags & XFS_ILOCK_SHARED))
364 4942444500 : return !!ip->i_lock.mr_writer;
365 4244257996 : return rwsem_is_locked(&ip->i_lock.mr_lock);
366 : }
367 :
368 1250923700 : if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
369 22306635 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
370 : (lock_flags & XFS_MMAPLOCK_SHARED));
371 : }
372 :
373 1228617065 : if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
374 1235920850 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
375 : (lock_flags & XFS_IOLOCK_SHARED));
376 : }
377 :
378 0 : ASSERT(0);
379 0 : return false;
380 : }
381 : #endif
382 :
383 : /*
384 : * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
385 : * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
386 : * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
387 : * errors and warnings.
388 : */
389 : #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
390 : static bool
391 : xfs_lockdep_subclass_ok(
392 : int subclass)
393 : {
394 : return subclass < MAX_LOCKDEP_SUBCLASSES;
395 : }
396 : #else
397 : #define xfs_lockdep_subclass_ok(subclass) (true)
398 : #endif
399 :
400 : /*
401 : * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
402 : * value. This can be called for any type of inode lock combination, including
403 : * parent locking. Care must be taken to ensure we don't overrun the subclass
404 : * storage fields in the class mask we build.
405 : */
406 : static inline uint
407 156770820 : xfs_lock_inumorder(
408 : uint lock_mode,
409 : uint subclass)
410 : {
411 156770820 : uint class = 0;
412 :
413 156770820 : ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
414 : XFS_ILOCK_RTSUM)));
415 156770820 : ASSERT(xfs_lockdep_subclass_ok(subclass));
416 :
417 156770820 : if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
418 0 : ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
419 0 : class += subclass << XFS_IOLOCK_SHIFT;
420 : }
421 :
422 156770820 : if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
423 0 : ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
424 0 : class += subclass << XFS_MMAPLOCK_SHIFT;
425 : }
426 :
427 156770820 : if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
428 156772955 : ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
429 156772955 : class += subclass << XFS_ILOCK_SHIFT;
430 : }
431 :
432 156770820 : return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
433 : }
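
/*
 * For example, locking the i-th inode of an i_ino-ordered set encodes the
 * position in the lockdep subclass bits, exactly as xfs_lock_inodes() below
 * does:
 *
 *	xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));
 */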
434 :
435 : /*
436 : * The following routine will lock n inodes in exclusive mode. We assume the
437 : * caller calls us with the inodes in i_ino order.
438 : *
439 : * We need to detect deadlock where an inode that we lock is in the AIL and we
440 : * start waiting for another inode that is locked by a thread in a long running
441 : * transaction (such as truncate). This can result in deadlock since the long
442 : * running trans might need to wait for the inode we just locked in order to
443 : * push the tail and free space in the log.
444 : *
445 : * xfs_lock_inodes() can only be used to lock one type of lock at a time -
446 : * the iolock, the mmaplock or the ilock - never more than one type. If we
447 : * lock more than one at a time, lockdep will report false positives saying we
448 : * have violated locking orders.
449 : */
450 : static void
451 22740317 : xfs_lock_inodes(
452 : struct xfs_inode **ips,
453 : int inodes,
454 : uint lock_mode)
455 : {
456 22740317 : int attempts = 0;
457 22740317 : uint i;
458 22740317 : int j;
459 22740317 : bool try_lock;
460 22740317 : struct xfs_log_item *lp;
461 :
462 : /*
463 : * Currently supports between 2 and 5 inodes with exclusive locking. We
464 : * support an arbitrary depth of locking here, but absolute limits on
465 : * inodes depend on the type of locking and the limits placed by
466 : * lockdep annotations in xfs_lock_inumorder. These are all checked by
467 : * the asserts.
468 : */
469 22740317 : ASSERT(ips && inodes >= 2 && inodes <= 5);
470 22740317 : ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
471 : XFS_ILOCK_EXCL));
472 22740317 : ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
473 : XFS_ILOCK_SHARED)));
474 22740317 : ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
475 : inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
476 22740317 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
477 : inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
478 :
479 22740317 : if (lock_mode & XFS_IOLOCK_EXCL) {
480 0 : ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
481 22740317 : } else if (lock_mode & XFS_MMAPLOCK_EXCL)
482 0 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
483 :
484 22740317 : again:
485 22748105 : try_lock = false;
486 22748105 : i = 0;
487 99382863 : for (; i < inodes; i++) {
488 76642533 : ASSERT(ips[i]);
489 :
490 76642533 : if (i && (ips[i] == ips[i - 1])) /* Already locked */
491 1442117 : continue;
492 :
493 : /*
494 : * If try_lock is not set yet, make sure all locked inodes are
495 : * not in the AIL. If any are, set try_lock to be used later.
496 : */
497 75200416 : if (!try_lock) {
498 106500708 : for (j = (i - 1); j >= 0 && !try_lock; j--) {
499 48471993 : lp = &ips[j]->i_itemp->ili_item;
500 92700508 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
501 15170358 : try_lock = true;
502 : }
503 : }
504 :
505 : /*
506 : * If any of the previous locks we have locked is in the AIL,
507 : * we must TRY to get the second and subsequent locks. If
508 : * we can't get any, we must release all we have
509 : * and try again.
510 : */
511 75200416 : if (!try_lock) {
512 42858375 : xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
513 42858379 : continue;
514 : }
515 :
516 : /* try_lock means we have an inode locked that is in the AIL. */
517 32342041 : ASSERT(i != 0);
518 32342041 : if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
519 32334262 : continue;
520 :
521 : /*
522 : * Unlock all previous guys and try again. xfs_iunlock will try
523 : * to push the tail if the inode is in the AIL.
524 : */
525 7788 : attempts++;
526 23807 : for (j = i - 1; j >= 0; j--) {
527 : /*
528 : * Check to see if we've already unlocked this one. Not
529 : * the first one going back, and the inode ptr is the
530 : * same.
531 : */
532 16019 : if (j != (i - 1) && ips[j] == ips[j + 1])
533 4923 : continue;
534 :
535 11096 : xfs_iunlock(ips[j], lock_mode);
536 : }
537 :
538 7788 : if ((attempts % 5) == 0) {
539 1544 : delay(1); /* Don't just spin the CPU */
540 : }
541 7788 : goto again;
542 : }
543 22740330 : }
544 :
545 : /*
546 : * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
547 : * mmaplock must be double-locked separately since we use i_rwsem and
548 : * invalidate_lock for that. We now support taking one lock EXCL and the
549 : * other SHARED.
550 : */
551 : void
552 40783193 : xfs_lock_two_inodes(
553 : struct xfs_inode *ip0,
554 : uint ip0_mode,
555 : struct xfs_inode *ip1,
556 : uint ip1_mode)
557 : {
558 40783193 : int attempts = 0;
559 40783193 : struct xfs_log_item *lp;
560 :
561 81567536 : ASSERT(hweight32(ip0_mode) == 1);
562 81568764 : ASSERT(hweight32(ip1_mode) == 1);
563 40784031 : ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
564 40784031 : ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
565 40784031 : ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
566 40784031 : ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
567 40784031 : ASSERT(ip0->i_ino != ip1->i_ino);
568 :
569 40784031 : if (ip0->i_ino > ip1->i_ino) {
570 4774188 : swap(ip0, ip1);
571 4774188 : swap(ip0_mode, ip1_mode);
572 : }
573 :
574 40784031 : again:
575 40788696 : xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
576 :
577 : /*
578 : * If the first lock we have locked is in the AIL, we must TRY to get
579 : * the second lock. If we can't get it, we must release the first one
580 : * and try again.
581 : */
582 40785894 : lp = &ip0->i_itemp->ili_item;
583 40785894 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
584 30604722 : if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
585 4665 : xfs_iunlock(ip0, ip0_mode);
586 4665 : if ((++attempts % 5) == 0)
587 923 : delay(1); /* Don't just spin the CPU */
588 4665 : goto again;
589 : }
590 : } else {
591 10181172 : xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
592 : }
593 40784363 : }
594 :
595 : uint
596 47784568727 : xfs_ip2xflags(
597 : struct xfs_inode *ip)
598 : {
599 47784568727 : uint flags = 0;
600 :
601 47784568727 : if (ip->i_diflags & XFS_DIFLAG_ANY) {
602 13654295892 : if (ip->i_diflags & XFS_DIFLAG_REALTIME)
603 5611010449 : flags |= FS_XFLAG_REALTIME;
604 13654295892 : if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
605 7408349088 : flags |= FS_XFLAG_PREALLOC;
606 13654295892 : if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
607 184 : flags |= FS_XFLAG_IMMUTABLE;
608 13654295892 : if (ip->i_diflags & XFS_DIFLAG_APPEND)
609 164 : flags |= FS_XFLAG_APPEND;
610 13654295892 : if (ip->i_diflags & XFS_DIFLAG_SYNC)
611 60 : flags |= FS_XFLAG_SYNC;
612 13654295892 : if (ip->i_diflags & XFS_DIFLAG_NOATIME)
613 42 : flags |= FS_XFLAG_NOATIME;
614 13654295892 : if (ip->i_diflags & XFS_DIFLAG_NODUMP)
615 46 : flags |= FS_XFLAG_NODUMP;
616 13654295892 : if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
617 2963739276 : flags |= FS_XFLAG_RTINHERIT;
618 13654295892 : if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
619 1220 : flags |= FS_XFLAG_PROJINHERIT;
620 13654295892 : if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
621 8 : flags |= FS_XFLAG_NOSYMLINKS;
622 13654295892 : if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
623 3696 : flags |= FS_XFLAG_EXTSIZE;
624 13654295892 : if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
625 3264 : flags |= FS_XFLAG_EXTSZINHERIT;
626 13654295892 : if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
627 0 : flags |= FS_XFLAG_NODEFRAG;
628 13654295892 : if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
629 14631 : flags |= FS_XFLAG_FILESTREAM;
630 : }
631 :
632 47784568727 : if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
633 47900253337 : if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
634 86 : flags |= FS_XFLAG_DAX;
635 47900253337 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
636 10436 : flags |= FS_XFLAG_COWEXTSIZE;
637 : }
638 :
639 47784568727 : if (xfs_inode_has_attr_fork(ip))
640 6249111322 : flags |= FS_XFLAG_HASATTR;
641 47784568727 : return flags;
642 : }
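
/*
 * Hedged note: these FS_XFLAG_* values are what userspace sees through the
 * FS_IOC_FSGETXATTR interface; in current kernels XFS feeds them into the
 * VFS fileattr mechanism roughly like:
 *
 *	fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
 */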
643 :
644 : /*
645 : * Looks up an inode from "name". If ci_name is not NULL, then a CI match
646 : * is allowed, otherwise it has to be an exact match. If a CI match is found,
647 : * ci_name->name will point to the actual name (caller must free) or
648 : * will be set to NULL if an exact match is found.
649 : */
650 : int
651 140251751 : xfs_lookup(
652 : struct xfs_inode *dp,
653 : const struct xfs_name *name,
654 : struct xfs_inode **ipp,
655 : struct xfs_name *ci_name)
656 : {
657 140251751 : xfs_ino_t inum;
658 140251751 : int error;
659 :
660 140251751 : trace_xfs_lookup(dp, name);
661 :
662 280508564 : if (xfs_is_shutdown(dp->i_mount))
663 : return -EIO;
664 :
665 140196447 : error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
666 140206202 : if (error)
667 51236705 : goto out_unlock;
668 :
669 88969497 : error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
670 88971541 : if (error)
671 2481 : goto out_free_name;
672 :
673 : return 0;
674 :
675 : out_free_name:
676 2481 : if (ci_name)
677 0 : kmem_free(ci_name->name);
678 2481 : out_unlock:
679 51239186 : *ipp = NULL;
680 51239186 : return error;
681 : }
682 :
683 : /* Propagate di_flags from a parent inode to a child inode. */
684 : static void
685 3472488 : xfs_inode_inherit_flags(
686 : struct xfs_inode *ip,
687 : const struct xfs_inode *pip)
688 : {
689 3472488 : unsigned int di_flags = 0;
690 3472488 : xfs_failaddr_t failaddr;
691 3472488 : umode_t mode = VFS_I(ip)->i_mode;
692 :
693 3472488 : if (S_ISDIR(mode)) {
694 1129482 : if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
695 1129476 : di_flags |= XFS_DIFLAG_RTINHERIT;
696 1129482 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
697 0 : di_flags |= XFS_DIFLAG_EXTSZINHERIT;
698 0 : ip->i_extsize = pip->i_extsize;
699 : }
700 1129482 : if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
701 2 : di_flags |= XFS_DIFLAG_PROJINHERIT;
702 2343006 : } else if (S_ISREG(mode)) {
703 2343011 : if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
704 2337447 : xfs_has_realtime(ip->i_mount))
705 2337443 : di_flags |= XFS_DIFLAG_REALTIME;
706 2343011 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
707 426 : di_flags |= XFS_DIFLAG_EXTSIZE;
708 426 : ip->i_extsize = pip->i_extsize;
709 : }
710 : }
711 3472488 : if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
712 0 : xfs_inherit_noatime)
713 0 : di_flags |= XFS_DIFLAG_NOATIME;
714 3472488 : if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
715 0 : xfs_inherit_nodump)
716 0 : di_flags |= XFS_DIFLAG_NODUMP;
717 3472488 : if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
718 0 : xfs_inherit_sync)
719 0 : di_flags |= XFS_DIFLAG_SYNC;
720 3472488 : if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
721 2 : xfs_inherit_nosymlinks)
722 0 : di_flags |= XFS_DIFLAG_NOSYMLINKS;
723 3472488 : if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
724 0 : xfs_inherit_nodefrag)
725 0 : di_flags |= XFS_DIFLAG_NODEFRAG;
726 3472488 : if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
727 4771 : di_flags |= XFS_DIFLAG_FILESTREAM;
728 :
729 3472488 : ip->i_diflags |= di_flags;
730 :
731 : /*
732 : * Inode verifiers on older kernels only check that the extent size
733 : * hint is an integer multiple of the rt extent size on realtime files.
734 : * They did not check the hint alignment on a directory with both
735 : * rtinherit and extszinherit flags set. If the misaligned hint is
736 : * propagated from a directory into a new realtime file, new file
737 : * allocations will fail due to math errors in the rt allocator and/or
738 : * trip the verifiers. Validate the hint settings in the new file so
739 : * that we don't let broken hints propagate.
740 : */
741 3472488 : failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
742 : VFS_I(ip)->i_mode, ip->i_diflags);
743 3472446 : if (failaddr) {
744 0 : ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
745 : XFS_DIFLAG_EXTSZINHERIT);
746 0 : ip->i_extsize = 0;
747 : }
748 3472446 : }
749 :
750 : /* Propagate di_flags2 from a parent inode to a child inode. */
751 : static void
752 15624729 : xfs_inode_inherit_flags2(
753 : struct xfs_inode *ip,
754 : const struct xfs_inode *pip)
755 : {
756 15624729 : xfs_failaddr_t failaddr;
757 :
758 15624729 : if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
759 360 : ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
760 360 : ip->i_cowextsize = pip->i_cowextsize;
761 : }
762 15624729 : if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
763 30 : ip->i_diflags2 |= XFS_DIFLAG2_DAX;
764 :
765 : /* Don't let invalid cowextsize hints propagate. */
766 15624729 : failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
767 : VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
768 15624840 : if (failaddr) {
769 0 : ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
770 0 : ip->i_cowextsize = 0;
771 : }
772 15624840 : }
773 :
774 : /*
775 : * Initialise a newly allocated inode and return the in-core inode to the
776 : * caller locked exclusively.
777 : */
778 : int
779 47345042 : xfs_init_new_inode(
780 : struct mnt_idmap *idmap,
781 : struct xfs_trans *tp,
782 : struct xfs_inode *pip,
783 : xfs_ino_t ino,
784 : umode_t mode,
785 : xfs_nlink_t nlink,
786 : dev_t rdev,
787 : prid_t prid,
788 : bool init_xattrs,
789 : struct xfs_inode **ipp)
790 : {
791 47345042 : struct inode *dir = pip ? VFS_I(pip) : NULL;
792 47345042 : struct xfs_mount *mp = tp->t_mountp;
793 47345042 : struct xfs_inode *ip;
794 47345042 : unsigned int flags;
795 47345042 : int error;
796 47345042 : struct timespec64 tv;
797 47345042 : struct inode *inode;
798 :
799 : /*
800 : * Protect against obviously corrupt allocation btree records. Later
801 : * xfs_iget checks will catch re-allocation of other active in-memory
802 : * and on-disk inodes. If we don't catch reallocating the parent inode
803 : * here we will deadlock in xfs_iget() so we have to do these checks
804 : * first.
805 : */
806 47345042 : if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
807 0 : xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
808 0 : return -EFSCORRUPTED;
809 : }
810 :
811 : /*
812 : * Get the in-core inode with the lock held exclusively to prevent
813 : * others from looking at it until we're done.
814 : */
815 47341748 : error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
816 47345183 : if (error)
817 : return error;
818 :
819 47345181 : ASSERT(ip != NULL);
820 47345181 : inode = VFS_I(ip);
821 47345181 : set_nlink(inode, nlink);
822 47343883 : inode->i_rdev = rdev;
823 47343883 : ip->i_projid = prid;
824 :
825 47343883 : if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
826 0 : inode_fsuid_set(inode, idmap);
827 0 : inode->i_gid = dir->i_gid;
828 0 : inode->i_mode = mode;
829 : } else {
830 47343883 : inode_init_owner(idmap, inode, dir, mode);
831 : }
832 :
833 : /*
834 : * If the group ID of the new file does not match the effective group
835 : * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
836 : * (and only if the irix_sgid_inherit compatibility variable is set).
837 : */
838 47343942 : if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
839 0 : !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
840 0 : inode->i_mode &= ~S_ISGID;
841 :
842 47343942 : ip->i_disk_size = 0;
843 47343942 : ip->i_df.if_nextents = 0;
844 47343942 : ASSERT(ip->i_nblocks == 0);
845 :
846 47343942 : tv = current_time(inode);
847 47344994 : inode->i_mtime = tv;
848 47344994 : inode->i_atime = tv;
849 47344994 : inode->i_ctime = tv;
850 :
851 47344994 : ip->i_extsize = 0;
852 47344994 : ip->i_diflags = 0;
853 :
854 47344994 : if (xfs_has_v3inodes(mp)) {
855 47343586 : inode_set_iversion(inode, 1);
856 47343586 : ip->i_cowextsize = 0;
857 47343586 : ip->i_crtime = tv;
858 : }
859 :
860 47344994 : flags = XFS_ILOG_CORE;
861 47344994 : switch (mode & S_IFMT) {
862 4432216 : case S_IFIFO:
863 : case S_IFCHR:
864 : case S_IFBLK:
865 : case S_IFSOCK:
866 4432216 : ip->i_df.if_format = XFS_DINODE_FMT_DEV;
867 4432216 : flags |= XFS_ILOG_DEV;
868 4432216 : break;
869 15633149 : case S_IFREG:
870 : case S_IFDIR:
871 15633149 : if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
872 3472480 : xfs_inode_inherit_flags(ip, pip);
873 15633109 : if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
874 15624762 : xfs_inode_inherit_flags2(ip, pip);
875 42912333 : fallthrough;
876 : case S_IFLNK:
877 42912333 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
878 42912333 : ip->i_df.if_bytes = 0;
879 42912333 : ip->i_df.if_u1.if_root = NULL;
880 42912333 : break;
881 0 : default:
882 0 : ASSERT(0);
883 : }
884 :
885 : /*
886 : * If we need to create attributes immediately after allocating the
887 : * inode, initialise an empty attribute fork right now. We use the
888 : * default fork offset for attributes here as we don't know exactly what
889 : * size or how many attributes we might be adding. We can do this
890 : * safely here because we know the data fork is completely empty and
891 : * this saves us from needing to run a separate transaction to set the
892 : * fork offset in the immediate future.
893 : */
894 47344549 : if (init_xattrs && xfs_has_attr(mp)) {
895 3323 : ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
896 3323 : xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
897 : }
898 :
899 : /*
900 : * Log the new values stuffed into the inode.
901 : */
902 47344549 : xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
903 47343118 : xfs_trans_log_inode(tp, ip, flags);
904 :
905 : /* now that we have an i_mode we can setup the inode structure */
906 47345745 : xfs_setup_inode(ip);
907 :
908 47345351 : *ipp = ip;
909 47345351 : return 0;
910 : }
911 :
912 : /*
913 : * Decrement the link count on an inode & log the change. If this causes the
914 : * link count to go to zero, move the inode to the AGI unlinked list so it can
915 : * be freed when the last active reference goes away via xfs_inactive().
916 : */
917 : static int /* error */
918 40077383 : xfs_droplink(
919 : xfs_trans_t *tp,
920 : xfs_inode_t *ip)
921 : {
922 40077383 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
923 :
924 40077270 : drop_nlink(VFS_I(ip));
925 40078209 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
926 :
927 40078528 : if (VFS_I(ip)->i_nlink)
928 : return 0;
929 :
930 33341670 : return xfs_iunlink(tp, ip);
931 : }
932 :
933 : /*
934 : * Increment the link count on an inode & log the change.
935 : */
936 : static void
937 12410089 : xfs_bumplink(
938 : xfs_trans_t *tp,
939 : xfs_inode_t *ip)
940 : {
941 12410089 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
942 :
943 12410157 : inc_nlink(VFS_I(ip));
944 12410172 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
945 12410171 : }
946 :
947 : int
948 18065174 : xfs_create(
949 : struct mnt_idmap *idmap,
950 : xfs_inode_t *dp,
951 : struct xfs_name *name,
952 : umode_t mode,
953 : dev_t rdev,
954 : bool init_xattrs,
955 : xfs_inode_t **ipp)
956 : {
957 18065174 : int is_dir = S_ISDIR(mode);
958 18065174 : struct xfs_mount *mp = dp->i_mount;
959 18065174 : struct xfs_inode *ip = NULL;
960 18065174 : struct xfs_trans *tp = NULL;
961 18065174 : int error;
962 18065174 : bool unlock_dp_on_error = false;
963 18065174 : prid_t prid;
964 18065174 : struct xfs_dquot *udqp = NULL;
965 18065174 : struct xfs_dquot *gdqp = NULL;
966 18065174 : struct xfs_dquot *pdqp = NULL;
967 18065174 : struct xfs_trans_res *tres;
968 18065174 : uint resblks;
969 18065174 : xfs_ino_t ino;
970 :
971 18065174 : trace_xfs_create(dp, name);
972 :
973 36130716 : if (xfs_is_shutdown(mp))
974 : return -EIO;
975 :
976 18065358 : prid = xfs_get_initial_prid(dp);
977 :
978 : /*
979 : * Make sure that we have allocated dquot(s) on disk.
980 : */
981 18065358 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
982 : mapped_fsgid(idmap, &init_user_ns), prid,
983 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
984 : &udqp, &gdqp, &pdqp);
985 18065695 : if (error)
986 : return error;
987 :
988 18063922 : if (is_dir) {
989 3317293 : resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
990 3317293 : tres = &M_RES(mp)->tr_mkdir;
991 : } else {
992 14746629 : resblks = XFS_CREATE_SPACE_RES(mp, name->len);
993 14746629 : tres = &M_RES(mp)->tr_create;
994 : }
995 :
996 : /*
997 : * Initially assume that the file does not exist and
998 : * reserve the resources for that case. If that is not
999 : * the case we'll drop the one we have and get a more
1000 : * appropriate transaction later.
1001 : */
1002 18063922 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1003 : &tp);
1004 18063872 : if (error == -ENOSPC) {
1005 : /* flush outstanding delalloc blocks and retry */
1006 288825 : xfs_flush_inodes(mp);
1007 288773 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
1008 : resblks, &tp);
1009 : }
1010 18063848 : if (error)
1011 279548 : goto out_release_dquots;
1012 :
1013 17784300 : xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1014 17784226 : unlock_dp_on_error = true;
1015 :
1016 : /*
1017 : * A newly created regular or special file just has one directory
1018 : * entry pointing to it, but a directory also has the "." entry
1019 : * pointing to itself.
1020 : */
1021 17784226 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1022 17784751 : if (!error)
1023 32022964 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1024 : is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
1025 17784344 : if (error)
1026 146827 : goto out_trans_cancel;
1027 :
1028 : /*
1029 : * Now we join the directory inode to the transaction. We do not do it
1030 : * earlier because xfs_dialloc might commit the previous transaction
1031 : * (and release all the locks). An error from here on will result in
1032 : * the transaction cancel unlocking dp so don't do it explicitly in the
1033 : * error path.
1034 : */
1035 17637517 : xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1036 17637351 : unlock_dp_on_error = false;
1037 :
1038 35274702 : error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1039 17637719 : resblks - XFS_IALLOC_SPACE_RES(mp));
1040 17637154 : if (error) {
1041 360 : ASSERT(error != -ENOSPC);
1042 360 : goto out_trans_cancel;
1043 : }
1044 17636794 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1045 17636320 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1046 :
1047 17636591 : if (is_dir) {
1048 3252166 : error = xfs_dir_init(tp, ip, dp);
1049 3252163 : if (error)
1050 0 : goto out_trans_cancel;
1051 :
1052 3252163 : xfs_bumplink(tp, dp);
1053 : }
1054 :
1055 : /*
1056 : * If this is a synchronous mount, make sure that the
1057 : * create transaction goes to disk before returning to
1058 : * the user.
1059 : */
1060 17636588 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1061 236 : xfs_trans_set_sync(tp);
1062 :
1063 : /*
1064 : * Attach the dquot(s) to the inodes and modify them incore.
1065 : * The IDs of the inode couldn't have changed since the new
1066 : * inode has been locked ever since it was created.
1067 : */
1068 17636588 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1069 :
1070 17637095 : error = xfs_trans_commit(tp);
1071 17636773 : if (error)
1072 198 : goto out_release_inode;
1073 :
1074 17636575 : xfs_qm_dqrele(udqp);
1075 17636219 : xfs_qm_dqrele(gdqp);
1076 17637006 : xfs_qm_dqrele(pdqp);
1077 :
1078 17637003 : *ipp = ip;
1079 17637003 : return 0;
1080 :
1081 147187 : out_trans_cancel:
1082 147187 : xfs_trans_cancel(tp);
1083 147386 : out_release_inode:
1084 : /*
1085 : * Wait until after the current transaction is aborted to finish the
1086 : * setup of the inode and release the inode. This prevents recursive
1087 : * transactions and deadlocks from xfs_inactive.
1088 : */
1089 147386 : if (ip) {
1090 558 : xfs_finish_inode_setup(ip);
1091 558 : xfs_irele(ip);
1092 : }
1093 146828 : out_release_dquots:
1094 426934 : xfs_qm_dqrele(udqp);
1095 426922 : xfs_qm_dqrele(gdqp);
1096 426949 : xfs_qm_dqrele(pdqp);
1097 :
1098 426951 : if (unlock_dp_on_error)
1099 146827 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
1100 : return error;
1101 : }
1102 :
1103 : int
1104 2456628 : xfs_create_tmpfile(
1105 : struct mnt_idmap *idmap,
1106 : struct xfs_inode *dp,
1107 : umode_t mode,
1108 : struct xfs_inode **ipp)
1109 : {
1110 2456628 : struct xfs_mount *mp = dp->i_mount;
1111 2456628 : struct xfs_inode *ip = NULL;
1112 2456628 : struct xfs_trans *tp = NULL;
1113 2456628 : int error;
1114 2456628 : prid_t prid;
1115 2456628 : struct xfs_dquot *udqp = NULL;
1116 2456628 : struct xfs_dquot *gdqp = NULL;
1117 2456628 : struct xfs_dquot *pdqp = NULL;
1118 2456628 : struct xfs_trans_res *tres;
1119 2456628 : uint resblks;
1120 2456628 : xfs_ino_t ino;
1121 :
1122 4913256 : if (xfs_is_shutdown(mp))
1123 : return -EIO;
1124 :
1125 2456628 : prid = xfs_get_initial_prid(dp);
1126 :
1127 : /*
1128 : * Make sure that we have allocated dquot(s) on disk.
1129 : */
1130 2456628 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1131 : mapped_fsgid(idmap, &init_user_ns), prid,
1132 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1133 : &udqp, &gdqp, &pdqp);
1134 2456710 : if (error)
1135 : return error;
1136 :
1137 2456710 : resblks = XFS_IALLOC_SPACE_RES(mp);
1138 2456710 : tres = &M_RES(mp)->tr_create_tmpfile;
1139 :
1140 2456710 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1141 : &tp);
1142 2456711 : if (error)
1143 36724 : goto out_release_dquots;
1144 :
1145 2419987 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1146 2419981 : if (!error)
1147 2419972 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1148 : 0, 0, prid, false, &ip);
1149 2419981 : if (error)
1150 1 : goto out_trans_cancel;
1151 :
1152 2419980 : if (xfs_has_wsync(mp))
1153 0 : xfs_trans_set_sync(tp);
1154 :
1155 : /*
1156 : * Attach the dquot(s) to the inodes and modify them incore.
1157 : * The IDs of the inode couldn't have changed since the new
1158 : * inode has been locked ever since it was created.
1159 : */
1160 2419980 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1161 :
1162 2419984 : error = xfs_iunlink(tp, ip);
1163 2418079 : if (error)
1164 0 : goto out_trans_cancel;
1165 :
1166 2418079 : error = xfs_trans_commit(tp);
1167 2419984 : if (error)
1168 2 : goto out_release_inode;
1169 :
1170 2419982 : xfs_qm_dqrele(udqp);
1171 2419979 : xfs_qm_dqrele(gdqp);
1172 2419984 : xfs_qm_dqrele(pdqp);
1173 :
1174 2419984 : *ipp = ip;
1175 2419984 : return 0;
1176 :
1177 1 : out_trans_cancel:
1178 1 : xfs_trans_cancel(tp);
1179 3 : out_release_inode:
1180 : /*
1181 : * Wait until after the current transaction is aborted to finish the
1182 : * setup of the inode and release the inode. This prevents recursive
1183 : * transactions and deadlocks from xfs_inactive.
1184 : */
1185 3 : if (ip) {
1186 2 : xfs_finish_inode_setup(ip);
1187 2 : xfs_irele(ip);
1188 : }
1189 1 : out_release_dquots:
1190 36727 : xfs_qm_dqrele(udqp);
1191 36727 : xfs_qm_dqrele(gdqp);
1192 36727 : xfs_qm_dqrele(pdqp);
1193 :
1194 36727 : return error;
1195 : }
1196 :
1197 : int
1198 3109955 : xfs_link(
1199 : xfs_inode_t *tdp,
1200 : xfs_inode_t *sip,
1201 : struct xfs_name *target_name)
1202 : {
1203 3109955 : xfs_mount_t *mp = tdp->i_mount;
1204 3109955 : xfs_trans_t *tp;
1205 3109955 : int error, nospace_error = 0;
1206 3109955 : int resblks;
1207 :
1208 3109955 : trace_xfs_link(tdp, target_name);
1209 :
1210 3109971 : ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1211 :
1212 6219942 : if (xfs_is_shutdown(mp))
1213 : return -EIO;
1214 :
1215 3109971 : error = xfs_qm_dqattach(sip);
1216 3109978 : if (error)
1217 0 : goto std_return;
1218 :
1219 3109978 : error = xfs_qm_dqattach(tdp);
1220 3109973 : if (error)
1221 2 : goto std_return;
1222 :
1223 3109971 : resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1224 3109971 : error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
1225 : &tp, &nospace_error);
1226 3109969 : if (error)
1227 0 : goto std_return;
1228 :
1229 : /*
1230 : * If we are using project inheritance, we only allow hard link
1231 : * creation in our tree when the project IDs are the same; else
1232 : * the tree quota mechanism could be circumvented.
1233 : */
1234 3109969 : if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1235 : tdp->i_projid != sip->i_projid)) {
1236 0 : error = -EXDEV;
1237 0 : goto error_return;
1238 : }
1239 :
1240 3109969 : if (!resblks) {
1241 28535 : error = xfs_dir_canenter(tp, tdp, target_name);
1242 28535 : if (error)
1243 165 : goto error_return;
1244 : }
1245 :
1246 : /*
1247 : * Handle initial link state of O_TMPFILE inode
1248 : */
1249 3109804 : if (VFS_I(sip)->i_nlink == 0) {
1250 8208 : struct xfs_perag *pag;
1251 :
1252 8208 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1253 8208 : error = xfs_iunlink_remove(tp, pag, sip);
1254 8208 : xfs_perag_put(pag);
1255 8208 : if (error)
1256 0 : goto error_return;
1257 : }
1258 :
1259 3109804 : error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1260 : resblks);
1261 3109824 : if (error)
1262 2 : goto error_return;
1263 3109822 : xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1264 3109810 : xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1265 :
1266 3109820 : xfs_bumplink(tp, sip);
1267 :
1268 : /*
1269 : * If this is a synchronous mount, make sure that the
1270 : * link transaction goes to disk before returning to
1271 : * the user.
1272 : */
1273 3109822 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1274 0 : xfs_trans_set_sync(tp);
1275 :
1276 3109822 : return xfs_trans_commit(tp);
1277 :
1278 167 : error_return:
1279 167 : xfs_trans_cancel(tp);
1280 169 : std_return:
1281 169 : if (error == -ENOSPC && nospace_error)
1282 165 : error = nospace_error;
1283 : return error;
1284 : }
1285 :
1286 : /* Clear the reflink flag and the cowblocks tag if possible. */
1287 : static void
1288 9914183 : xfs_itruncate_clear_reflink_flags(
1289 : struct xfs_inode *ip)
1290 : {
1291 9914183 : struct xfs_ifork *dfork;
1292 9914183 : struct xfs_ifork *cfork;
1293 :
1294 9914183 : if (!xfs_is_reflink_inode(ip))
1295 : return;
1296 3616560 : dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1297 3616560 : cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
1298 3616560 : if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1299 689405 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1300 3616560 : if (cfork->if_bytes == 0)
1301 3411589 : xfs_inode_clear_cowblocks_tag(ip);
1302 : }
1303 :
1304 : /*
1305 : * Free up the underlying blocks past new_size. The new size must be smaller
1306 : * than the current size. This routine can be used both for the attribute and
1307 : * data fork, and does not modify the inode size, which is left to the caller.
1308 : *
1309 : * The transaction passed to this routine must have made a permanent log
1310 : * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1311 : * given transaction and start new ones, so make sure everything involved in
1312 : * the transaction is tidy before calling here. A transaction (possibly new) will be
1313 : * returned to the caller to be committed. The incoming transaction must
1314 : * already include the inode, and both inode locks must be held exclusively.
1315 : * The inode must also be "held" within the transaction. On return the inode
1316 : * will be "held" within the returned transaction. This routine does NOT
1317 : * require any disk space to be reserved for it within the transaction.
1318 : *
1319 : * If we get an error, we must return with the inode locked and linked into the
1320 : * current transaction. This keeps things simple for the higher level code,
1321 : * because it always knows that the inode is locked and held in the transaction
1322 : * that returns to it whether errors occur or not. We don't mark the inode
1323 : * dirty on error so that transactions can be easily aborted if possible.
1324 : */
1325 : int
1326 10520143 : xfs_itruncate_extents_flags(
1327 : struct xfs_trans **tpp,
1328 : struct xfs_inode *ip,
1329 : int whichfork,
1330 : xfs_fsize_t new_size,
1331 : int flags)
1332 : {
1333 10520143 : struct xfs_mount *mp = ip->i_mount;
1334 10520143 : struct xfs_trans *tp = *tpp;
1335 10520143 : xfs_fileoff_t first_unmap_block;
1336 10520143 : xfs_filblks_t unmap_len;
1337 10520143 : int error = 0;
1338 :
1339 10520143 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1340 17823928 : ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1341 : xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1342 21040286 : ASSERT(new_size <= XFS_ISIZE(ip));
1343 10520143 : ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1344 10520143 : ASSERT(ip->i_itemp != NULL);
1345 10520143 : ASSERT(ip->i_itemp->ili_lock_flags == 0);
1346 10520143 : ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1347 :
1348 10520143 : trace_xfs_itruncate_extents_start(ip, new_size);
1349 :
1350 10520018 : flags |= xfs_bmapi_aflag(whichfork);
1351 :
1352 : /*
1353 : * Since it is possible for space to become allocated beyond
1354 : * the end of the file (in a crash where the space is allocated
1355 : * but the inode size is not yet updated), simply remove any
1356 : * blocks which show up between the new EOF and the maximum
1357 : * possible file size.
1358 : *
1359 : * We have to free all the blocks to the bmbt maximum offset, even if
1360 : * the page cache can't scale that far.
1361 : */
1362 10520018 : first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1363 10520018 : if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1364 0 : WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1365 0 : return 0;
1366 : }
1367 :
1368 10520085 : unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1369 42216984 : while (unmap_len > 0) {
1370 31698362 : ASSERT(tp->t_highest_agno == NULLAGNUMBER);
1371 31698362 : error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1372 : flags, XFS_ITRUNC_MAX_EXTENTS);
1373 31698194 : if (error)
1374 461 : goto out;
1375 :
1376 : /* free the just unmapped extents */
1377 31697733 : error = xfs_defer_finish(&tp);
1378 31698155 : if (error)
1379 1256 : goto out;
1380 : }
1381 :
1382 10518622 : if (whichfork == XFS_DATA_FORK) {
1383 : /* Remove all pending CoW reservations. */
1384 9914265 : error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1385 : first_unmap_block, XFS_MAX_FILEOFF, true);
1386 9914141 : if (error)
1387 0 : goto out;
1388 :
1389 9914141 : xfs_itruncate_clear_reflink_flags(ip);
1390 : }
1391 :
1392 : /*
1393 : * Always re-log the inode so that our permanent transaction can keep
1394 : * on rolling it forward in the log.
1395 : */
1396 10518548 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1397 :
1398 10518768 : trace_xfs_itruncate_extents_end(ip, new_size);
1399 :
1400 10520493 : out:
1401 10520493 : *tpp = tp;
1402 10520493 : return error;
1403 : }
1404 :
1405 : int
1406 321209600 : xfs_release(
1407 : xfs_inode_t *ip)
1408 : {
1409 321209600 : xfs_mount_t *mp = ip->i_mount;
1410 321209600 : int error = 0;
1411 :
1412 321209600 : if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1413 : return 0;
1414 :
1415 : /* If this is a read-only mount, don't do this (would generate I/O) */
1416 642422204 : if (xfs_is_readonly(mp))
1417 : return 0;
1418 :
1419 637433682 : if (!xfs_is_shutdown(mp)) {
1420 318536661 : int truncated;
1421 :
1422 : /*
1423 : * If we previously truncated this file and removed old data
1424 : * in the process, we want to initiate "early" writeout on
1425 : * the last close. This is an attempt to combat the notorious
1426 : * NULL files problem which is particularly noticeable from a
1427 : * truncate down, buffered (re-)write (delalloc), followed by
1428 : * a crash. What we are effectively doing here is
1429 : * significantly reducing the time window where we'd otherwise
1430 : * be exposed to that problem.
1431 : */
1432 318536661 : truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1433 318549969 : if (truncated) {
1434 1323366 : xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1435 1323367 : if (ip->i_delayed_blks > 0) {
1436 105978 : error = filemap_flush(VFS_I(ip)->i_mapping);
1437 105978 : if (error)
1438 : return error;
1439 : }
1440 : }
1441 : }
1442 :
1443 318730129 : if (VFS_I(ip)->i_nlink == 0)
1444 : return 0;
1445 :
1446 : /*
1447 : * If we can't get the iolock just skip truncating the blocks past EOF
1448 : * because we could deadlock with the mmap_lock otherwise. We'll get
1449 : * another chance to drop them once the last reference to the inode is
1450 : * dropped, so we'll never leak blocks permanently.
1451 : */
1452 317725591 : if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1453 : return 0;
1454 :
1455 317500106 : if (xfs_can_free_eofblocks(ip, false)) {
1456 : /*
1457 : * Check if the inode is being opened, written and closed
1458 : * frequently and we have delayed allocation blocks outstanding
1459 : * (e.g. streaming writes from the NFS server), truncating the
1460 : * blocks past EOF will cause fragmentation to occur.
1461 : *
1462 : * In this case don't do the truncation, but we have to be
1463 : * careful how we detect this case. Blocks beyond EOF show up as
1464 : * i_delayed_blks even when the inode is clean, so we need to
1465 : * truncate them away first before checking for a dirty release.
1466 : * Hence on the first dirty close we will still remove the
1467 : * speculative allocation, but after that we will leave it in
1468 : * place.
1469 : */
1470 39181541 : if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1471 14949033 : goto out_unlock;
1472 :
1473 4641791 : error = xfs_free_eofblocks(ip);
1474 4641777 : if (error)
1475 5 : goto out_unlock;
1476 :
1477 : /* delalloc blocks after truncation means it really is dirty */
1478 4641772 : if (ip->i_delayed_blks)
1479 4627444 : xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1480 : }
1481 :
1482 297914135 : out_unlock:
1483 317490640 : xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1484 317490640 : return error;
1485 : }
1486 :
1487 : /*
1488 : * xfs_inactive_truncate
1489 : *
1490 : * Called to perform a truncate when an inode becomes unlinked.
1491 : */
1492 : STATIC int
1493 2459784 : xfs_inactive_truncate(
1494 : struct xfs_inode *ip)
1495 : {
1496 2459784 : struct xfs_mount *mp = ip->i_mount;
1497 2459784 : struct xfs_trans *tp;
1498 2459784 : int error;
1499 :
1500 2459784 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1501 2461134 : if (error) {
1502 1000 : ASSERT(xfs_is_shutdown(mp));
1503 500 : return error;
1504 : }
1505 2460634 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1506 2460593 : xfs_trans_ijoin(tp, ip, 0);
1507 :
1508 : /*
1509 : * Log the inode size first to prevent stale data exposure in the event
1510 : * of a system crash before the truncate completes. See the related
1511 : * comment in xfs_vn_setattr_size() for details.
1512 : */
1513 2459501 : ip->i_disk_size = 0;
1514 2459501 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1515 :
1516 2460756 : error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1517 2460790 : if (error)
1518 1180 : goto error_trans_cancel;
1519 :
1520 2459610 : ASSERT(ip->i_df.if_nextents == 0);
1521 :
1522 2459610 : error = xfs_trans_commit(tp);
1523 2459539 : if (error)
1524 0 : goto error_unlock;
1525 :
1526 2459539 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1527 2459539 : return 0;
1528 :
1529 : error_trans_cancel:
1530 1180 : xfs_trans_cancel(tp);
1531 1180 : error_unlock:
1532 1180 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1533 1180 : return error;
1534 : }
1535 :
1536 : /*
1537 : * xfs_inactive_ifree()
1538 : *
1539 : * Perform the inode free when an inode is unlinked.
1540 : */
1541 : STATIC int
1542 34309758 : xfs_inactive_ifree(
1543 : struct xfs_inode *ip)
1544 : {
1545 34309758 : struct xfs_mount *mp = ip->i_mount;
1546 34309758 : struct xfs_trans *tp;
1547 34309758 : int error;
1548 :
1549 : /*
1550 : * We try to use a per-AG reservation for any block needed by the finobt
1551 : * tree, but as the finobt feature predates the per-AG reservation
1552 : * support a degraded file system might not have enough space for the
1553 : * reservation at mount time. In that case try to dip into the reserved
1554 : * pool and pray.
1555 : *
1556 : * Send a warning if the reservation does happen to fail, as the inode
1557 : * now remains allocated and sits on the unlinked list until the fs is
1558 : * repaired.
1559 : */
1560 34309758 : if (unlikely(mp->m_finobt_nores)) {
1561 0 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1562 : XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1563 : &tp);
1564 : } else {
1565 34309758 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1566 : }
1567 34312191 : if (error) {
1568 143 : if (error == -ENOSPC) {
1569 0 : xfs_warn_ratelimited(mp,
1570 : "Failed to remove inode(s) from unlinked list. "
1571 : "Please free space, unmount and run xfs_repair.");
1572 : } else {
1573 286 : ASSERT(xfs_is_shutdown(mp));
1574 : }
1575 143 : return error;
1576 : }
1577 :
1578 : /*
1579 : * We do not hold the inode locked across the entire rolling transaction
1580 : * here. We only need to hold it for the first transaction that
1581 : * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1582 : * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1583 : * here breaks the relationship between cluster buffer invalidation and
1584 : * stale inode invalidation on cluster buffer item journal commit
1585 : * completion, and can result in leaving dirty stale inodes hanging
1586 : * around in memory.
1587 : *
1588 : * We have no need for serialising this inode operation against other
1589 : * operations - we freed the inode and hence reallocation is required
1590 : * and that will serialise on reallocating the space the deferops need
1591 : * to free. Hence we can unlock the inode on the first commit of
1592 : * the transaction rather than roll it right through the deferops. This
1593 : * avoids relogging the XFS_ISTALE inode.
1594 : *
1595 : * We check that xfs_ifree() hasn't grown an internal transaction roll
1596 : * by asserting that the inode is still locked when it returns.
1597 : */
1598 34312048 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1599 34311975 : xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1600 :
1601 34309300 : error = xfs_ifree(tp, ip);
1602 34312250 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1603 34312250 : if (error) {
1604 : /*
1605 : * If we fail to free the inode, shut down. The cancel
1606 : * might do that; we need to make sure. Otherwise the
1607 : * inode might be lost for a long time or forever.
1608 : */
1609 316 : if (!xfs_is_shutdown(mp)) {
1610 5 : xfs_notice(mp, "%s: xfs_ifree returned error %d",
1611 : __func__, error);
1612 5 : xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1613 : }
1614 158 : xfs_trans_cancel(tp);
1615 158 : return error;
1616 : }
1617 :
1618 : /*
1619 : * Credit the quota account(s). The inode is gone.
1620 : */
1621 34312092 : xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1622 :
1623 34311464 : return xfs_trans_commit(tp);
1624 : }
1625 :
1626 : /*
1627 : * Returns true if we need to update the on-disk metadata before we can free
1628 : * the memory used by this inode. Updates include freeing post-eof
1629 : * preallocations; freeing COW staging extents; and marking the inode free in
1630 : * the inobt if it is on the unlinked list.
1631 : */
1632 : bool
1633 988849574 : xfs_inode_needs_inactive(
1634 : struct xfs_inode *ip)
1635 : {
1636 988849574 : struct xfs_mount *mp = ip->i_mount;
1637 988849574 : struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1638 :
1639 : /*
1640 : * If the inode is already free, then there can be nothing
1641 : * to clean up here.
1642 : */
1643 988849574 : if (VFS_I(ip)->i_mode == 0)
1644 : return false;
1645 :
1646 : /* If this is a read-only mount, don't do this (would generate I/O) */
1647 1977699148 : if (xfs_is_readonly(mp))
1648 : return false;
1649 :
1650 : /* If the log isn't running, push inodes straight to reclaim. */
1651 1934972584 : if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1652 : return false;
1653 :
1654 : /* Metadata inodes require explicit resource cleanup. */
1655 705858502 : if (xfs_is_metadata_inode(ip))
1656 : return false;
1657 :
1658 : /* Want to clean out the cow blocks if there are any. */
1659 705812735 : if (cow_ifp && cow_ifp->if_bytes > 0)
1660 : return true;
1661 :
1662 : /* Unlinked files must be freed. */
1663 705807989 : if (VFS_I(ip)->i_nlink == 0)
1664 : return true;
1665 :
1666 : /*
1667 : * This file isn't being freed, so check if there are post-eof blocks
1668 : * to free. @force is true because we are evicting an inode from the
1669 : * cache. Post-eof blocks must be freed, lest we end up with broken
1670 : * free space accounting.
1671 : *
1672 : * Note: don't bother with iolock here since lockdep complains about
1673 : * acquiring it in reclaim context. We have the only reference to the
1674 : * inode at this point anyways.
1675 : */
1676 671499144 : return xfs_can_free_eofblocks(ip, true);
1677 : }
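
/*
 * The checks above reduce to a small predicate: four hard "no" conditions,
 * then any one of three cleanup triggers says "yes". A self-contained
 * restatement of that decision table (illustrative names only, not kernel
 * types or APIs):
 */
#include <stdbool.h>
#include <stdio.h>

struct inactive_state {		/* hypothetical flattened inputs */
	bool already_free, readonly, shutdown, metadata;
	bool has_cow_blocks, unlinked, has_eofblocks;
};

static bool model_needs_inactive(const struct inactive_state *s)
{
	if (s->already_free || s->readonly || s->shutdown || s->metadata)
		return false;	/* nothing to do, or no I/O allowed */
	return s->has_cow_blocks || s->unlinked || s->has_eofblocks;
}

int main(void)
{
	struct inactive_state s = { .unlinked = true };

	printf("%d\n", model_needs_inactive(&s));	/* 1 */
	return 0;
}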
1678 :
1679 : /*
1680 : * xfs_inactive
1681 : *
1682 : * This is called when the vnode reference count for the vnode
1683 : * goes to zero. If the file has been unlinked, then it must
1684 : * now be truncated. Also, we clear all of the read-ahead state
1685 : * kept for the inode here since the file is now closed.
1686 : */
1687 : int
1688 34464829 : xfs_inactive(
1689 : xfs_inode_t *ip)
1690 : {
1691 34464829 : struct xfs_mount *mp;
1692 34464829 : int error = 0;
1693 34464829 : int truncate = 0;
1694 :
1695 : /*
1696 : * If the inode is already free, then there can be nothing
1697 : * to clean up here.
1698 : */
1699 34464829 : if (VFS_I(ip)->i_mode == 0) {
1700 0 : ASSERT(ip->i_df.if_broot_bytes == 0);
1701 0 : goto out;
1702 : }
1703 :
1704 34464829 : mp = ip->i_mount;
1705 68931790 : ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1706 :
1707 : /* If this is a read-only mount, don't do this (would generate I/O) */
1708 68933922 : if (xfs_is_readonly(mp))
1709 0 : goto out;
1710 :
1711 : /* Metadata inodes require explicit resource cleanup. */
1712 34466961 : if (xfs_is_metadata_inode(ip))
1713 0 : goto out;
1714 :
1715 : /* Try to clean out the cow blocks if there are any. */
1716 68933922 : if (xfs_inode_has_cow_data(ip))
1717 4746 : xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1718 :
1719 34466961 : if (VFS_I(ip)->i_nlink != 0) {
1720 : /*
1721 : * force is true because we are evicting an inode from the
1722 : * cache. Post-eof blocks must be freed, lest we end up with
1723 : * broken free space accounting.
1724 : *
1725 : * Note: don't bother with iolock here since lockdep complains
1726 : * about acquiring it in reclaim context. We have the only
1727 : * reference to the inode at this point anyways.
1728 : */
1729 155168 : if (xfs_can_free_eofblocks(ip, true))
1730 151449 : error = xfs_free_eofblocks(ip);
1731 :
1732 155172 : goto out;
1733 : }
1734 :
1735 34311793 : if (S_ISREG(VFS_I(ip)->i_mode) &&
1736 6578784 : (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
1737 4128489 : ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1738 : truncate = 1;
1739 :
1740 34311793 : error = xfs_qm_dqattach(ip);
1741 34308402 : if (error)
1742 24 : goto out;
1743 :
1744 34308378 : if (S_ISLNK(VFS_I(ip)->i_mode))
1745 26671143 : error = xfs_inactive_symlink(ip);
1746 7637235 : else if (truncate)
1747 2459773 : error = xfs_inactive_truncate(ip);
1748 34310842 : if (error)
1749 1689 : goto out;
1750 :
1751 : /*
1752 : * If there are attributes associated with the file, then blow them away
1753 : * now. The code calls a routine that recursively deconstructs the
1754 : * attribute fork. It also blows away the in-core attribute fork.
1755 : */
1756 34309153 : if (xfs_inode_has_attr_fork(ip)) {
1757 1040363 : error = xfs_attr_inactive(ip);
1758 1040629 : if (error)
1759 74 : goto out;
1760 : }
1761 :
1762 34309345 : ASSERT(ip->i_forkoff == 0);
1763 :
1764 : /*
1765 : * Free the inode.
1766 : */
1767 34309345 : error = xfs_inactive_ifree(ip);
1768 :
1769 34469392 : out:
1770 : /*
1771 : * We're done making metadata updates for this inode, so we can release
1772 : * the attached dquots.
1773 : */
1774 34469392 : xfs_qm_dqdetach(ip);
1775 34469386 : return error;
1776 : }
1777 :
1778 : /*
1779 : * In-Core Unlinked List Lookups
1780 : * =============================
1781 : *
1782 : * Every inode is supposed to be reachable from some other piece of metadata
1783 : * with the exception of the root directory. Inodes with a connection to a
1784 : * file descriptor but not linked from anywhere in the on-disk directory tree
1785 : * are collectively known as unlinked inodes, though the filesystem itself
1786 : * maintains links to these inodes so that on-disk metadata are consistent.
1787 : *
1788 : * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1789 : * header contains a number of buckets that point to an inode, and each inode
1790 : * record has a pointer to the next inode in the hash chain. This
1791 : * singly-linked list causes scaling problems in the iunlink remove function
1792 : * because we must walk that list to find the inode that points to the inode
1793 : * being removed from the unlinked hash bucket list.
1794 : *
1795 : * Hence we keep an in-memory doubly linked list to link each inode on an
1796 : * unlinked list. Because there are 64 unlinked lists per AGI, keeping
1797 : * pointer-based lists would require 64 list heads in the perag, one for each
1798 : * list. This is expensive in terms of memory (think millions of AGs) and cache
1799 : * misses on lookups. Instead, use the fact that inodes on the unlinked list
1800 : * must be referenced at the VFS level to keep them on the list and hence we
1801 : * have an existence guarantee for inodes on the unlinked list.
1802 : *
1803 : * Given we have an existence guarantee, we can use lockless inode cache lookups
1804 : * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
1805 : * for the doubly linked unlinked list, and we don't need any extra locking to
1806 : * keep the list safe as all manipulations are done under the AGI buffer lock.
1807 : * Keeping the list up to date does not require memory allocation, just finding
1808 : * the XFS inode and updating the next/prev unlinked list aginos.
1809 : */
1810 :
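/*
 * The bookkeeping described above is ordinary doubly linked list surgery
 * keyed by agino. Below is a minimal, runnable userspace model of it (see
 * xfs_iunlink_insert_inode() and xfs_iunlink_remove_inode() further down
 * for the real thing); the model_* names, the flat itab[] cache and the
 * simplified update order are illustrative assumptions, and all locking,
 * logging and error handling are omitted:
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_NULLAGINO ((uint32_t)-1)
#define MODEL_BUCKETS 64

struct model_inode {		/* just the unlinked-list fields */
	uint32_t agino;
	uint32_t next_unlinked;
	uint32_t prev_unlinked;
};

static uint32_t bucket[MODEL_BUCKETS];	/* models agi_unlinked[] */
static struct model_inode *itab[256];	/* models the per-AG inode cache */

/* Push at the head of bucket agino % 64, as the insert path does. */
static void model_iunlink(struct model_inode *ip)
{
	uint32_t *head = &bucket[ip->agino % MODEL_BUCKETS];

	if (*head != MODEL_NULLAGINO)
		itab[*head]->prev_unlinked = ip->agino;	/* backref update */
	ip->next_unlinked = *head;
	ip->prev_unlinked = MODEL_NULLAGINO;
	*head = ip->agino;
}

/* Unlink from anywhere in the chain, as the remove path does. */
static void model_iunlink_remove(struct model_inode *ip)
{
	uint32_t *head = &bucket[ip->agino % MODEL_BUCKETS];

	if (ip->next_unlinked != MODEL_NULLAGINO)
		itab[ip->next_unlinked]->prev_unlinked = ip->prev_unlinked;
	if (*head == ip->agino)
		*head = ip->next_unlinked;	/* head case: update the AGI */
	else
		itab[ip->prev_unlinked]->next_unlinked = ip->next_unlinked;
	ip->next_unlinked = ip->prev_unlinked = MODEL_NULLAGINO;
}

int main(void)
{
	struct model_inode a = { .agino = 7 }, b = { .agino = 71 };

	for (int i = 0; i < MODEL_BUCKETS; i++)
		bucket[i] = MODEL_NULLAGINO;
	itab[7] = &a;
	itab[71] = &b;

	model_iunlink(&a);
	model_iunlink(&b);		/* 7 and 71 share bucket 7 */
	assert(bucket[7] == 71 && a.prev_unlinked == 71);
	model_iunlink_remove(&a);	/* interior removal only touches b */
	assert(b.next_unlinked == MODEL_NULLAGINO);
	printf("model ok\n");
	return 0;
}
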
1811 : /*
1812 : * Find an inode on the unlinked list. This does not take references to the
1813 : * inode, as we have an existence guarantee: we hold the AGI buffer lock and
1814 : * only unlinked, referenced inodes can be on the unlinked inode list. If we
1815 : * don't find the inode in cache, then let the caller handle the situation.
1816 : */
1817 : static struct xfs_inode *
1818 18677543 : xfs_iunlink_lookup(
1819 : struct xfs_perag *pag,
1820 : xfs_agino_t agino)
1821 : {
1822 18677543 : struct xfs_inode *ip;
1823 :
1824 18677543 : rcu_read_lock();
1825 18675531 : ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1826 :
1827 : /*
1828 : * An inode that is not in memory, or that is in RCU freeing limbo, should
1829 : * not happen here. Warn about this and let the caller handle the failure.
1830 : */
1831 37336128 : if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
1832 0 : rcu_read_unlock();
1833 0 : return NULL;
1834 : }
1835 37344539 : ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
1836 18678178 : rcu_read_unlock();
1837 18678178 : return ip;
1838 : }
1839 :
1840 : /* Update the prev pointer of the next agino. */
1841 : static int
1842 71518563 : xfs_iunlink_update_backref(
1843 : struct xfs_perag *pag,
1844 : xfs_agino_t prev_agino,
1845 : xfs_agino_t next_agino)
1846 : {
1847 71518563 : struct xfs_inode *ip;
1848 :
1849 : /* No update necessary if we are at the end of the list. */
1850 71518563 : if (next_agino == NULLAGINO)
1851 : return 0;
1852 :
1853 15677260 : ip = xfs_iunlink_lookup(pag, next_agino);
1854 15676472 : if (!ip)
1855 : return -EFSCORRUPTED;
1856 15676472 : ip->i_prev_unlinked = prev_agino;
1857 15676472 : return 0;
1858 : }
1859 :
1860 : /*
1861 : * Point the AGI unlinked bucket at an inode and log the results. The caller
1862 : * is responsible for validating the old value.
1863 : */
1864 : STATIC int
1865 68517248 : xfs_iunlink_update_bucket(
1866 : struct xfs_trans *tp,
1867 : struct xfs_perag *pag,
1868 : struct xfs_buf *agibp,
1869 : unsigned int bucket_index,
1870 : xfs_agino_t new_agino)
1871 : {
1872 68517248 : struct xfs_agi *agi = agibp->b_addr;
1873 68517248 : xfs_agino_t old_value;
1874 68517248 : int offset;
1875 :
1876 109494609 : ASSERT(xfs_verify_agino_or_null(pag, new_agino));
1877 :
1878 68517248 : old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1879 68514140 : trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
1880 : old_value, new_agino);
1881 :
1882 : /*
1883 : * We should never find the head of the list already set to the value
1884 : * passed in because either we're adding or removing ourselves from the
1885 : * head of the list.
1886 : */
1887 68514155 : if (old_value == new_agino) {
1888 0 : xfs_buf_mark_corrupt(agibp);
1889 0 : return -EFSCORRUPTED;
1890 : }
1891 :
1892 137028310 : agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
1893 68514265 : offset = offsetof(struct xfs_agi, agi_unlinked) +
1894 : (sizeof(xfs_agino_t) * bucket_index);
1895 68514265 : xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
1896 68514265 : return 0;
1897 : }
1898 :
1899 : static int
1900 35760302 : xfs_iunlink_insert_inode(
1901 : struct xfs_trans *tp,
1902 : struct xfs_perag *pag,
1903 : struct xfs_buf *agibp,
1904 : struct xfs_inode *ip)
1905 : {
1906 35760302 : struct xfs_mount *mp = tp->t_mountp;
1907 35760302 : struct xfs_agi *agi = agibp->b_addr;
1908 35760302 : xfs_agino_t next_agino;
1909 35760302 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1910 35760302 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1911 35760302 : int error;
1912 :
1913 : /*
1914 : * Get the index into the agi hash table for the list this inode will
1915 : * go on. Make sure the pointer isn't garbage and that this inode
1916 : * isn't already on the list.
1917 : */
1918 35760302 : next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1919 35760622 : if (next_agino == agino ||
1920 : !xfs_verify_agino_or_null(pag, next_agino)) {
1921 0 : xfs_buf_mark_corrupt(agibp);
1922 0 : return -EFSCORRUPTED;
1923 : }
1924 :
1925 : /*
1926 : * Update the prev pointer in the next inode to point back to this
1927 : * inode.
1928 : */
1929 35761072 : error = xfs_iunlink_update_backref(pag, agino, next_agino);
1930 35760625 : if (error)
1931 : return error;
1932 :
1933 35760625 : if (next_agino != NULLAGINO) {
1934 : /*
1935 : * There is already another inode in the bucket, so point this
1936 : * inode to the current head of the list.
1937 : */
1938 8221944 : error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
1939 8222101 : if (error)
1940 : return error;
1941 8222101 : ip->i_next_unlinked = next_agino;
1942 : }
1943 :
1944 : /* Point the head of the list to point to this inode. */
1945 35760782 : return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
1946 : }
1947 :
1948 : /*
1949 : * This is called when the inode's link count has gone to 0 or we are creating
1950 : * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
1951 : *
1952 : * We place the on-disk inode on a list in the AGI. It will be pulled from this
1953 : * list when the inode is freed.
1954 : */
1955 : STATIC int
1956 35760973 : xfs_iunlink(
1957 : struct xfs_trans *tp,
1958 : struct xfs_inode *ip)
1959 : {
1960 35760973 : struct xfs_mount *mp = tp->t_mountp;
1961 35760973 : struct xfs_perag *pag;
1962 35760973 : struct xfs_buf *agibp;
1963 35760973 : int error;
1964 :
1965 35760973 : ASSERT(VFS_I(ip)->i_nlink == 0);
1966 35760973 : ASSERT(VFS_I(ip)->i_mode != 0);
1967 35760973 : trace_xfs_iunlink(ip);
1968 :
1969 35761620 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1970 :
1971 : /* Get the agi buffer first. It ensures lock ordering on the list. */
1972 35761533 : error = xfs_read_agi(pag, tp, &agibp);
1973 35760277 : if (error)
1974 10 : goto out;
1975 :
1976 35760267 : error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
1977 35760831 : out:
1978 35760831 : xfs_perag_put(pag);
1979 35761116 : return error;
1980 : }
1981 :
1982 : static int
1983 35761389 : xfs_iunlink_remove_inode(
1984 : struct xfs_trans *tp,
1985 : struct xfs_perag *pag,
1986 : struct xfs_buf *agibp,
1987 : struct xfs_inode *ip)
1988 : {
1989 35761389 : struct xfs_mount *mp = tp->t_mountp;
1990 35761389 : struct xfs_agi *agi = agibp->b_addr;
1991 35761389 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1992 35761389 : xfs_agino_t head_agino;
1993 35761389 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1994 35761389 : int error;
1995 :
1996 35761389 : trace_xfs_iunlink_remove(ip);
1997 :
1998 : /*
1999 : * Get the index into the agi hash table for the list this inode is
2000 : * on. Make sure the head pointer isn't garbage.
2001 : */
2002 35762586 : head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2003 35762935 : if (!xfs_verify_agino(pag, head_agino)) {
2004 952 : XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2005 : agi, sizeof(*agi));
2006 0 : return -EFSCORRUPTED;
2007 : }
2008 :
2009 : /*
2010 : * Set our inode's next_unlinked pointer to NULL and then return
2011 : * the old pointer value so that we can update whatever was previous
2012 : * to us in the list to point to whatever was next in the list.
2013 : */
2014 35762011 : error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
2015 35761259 : if (error)
2016 : return error;
2017 :
2018 : /*
2019 : * Update the prev pointer in the next inode to point back to the previous
2020 : * inode in the chain.
2021 : */
2022 35761480 : error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
2023 : ip->i_next_unlinked);
2024 35760138 : if (error)
2025 : return error;
2026 :
2027 35760138 : if (head_agino != agino) {
2028 3002678 : struct xfs_inode *prev_ip;
2029 :
2030 3002678 : prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
2031 3002376 : if (!prev_ip)
2032 : return -EFSCORRUPTED;
2033 :
2034 3002376 : error = xfs_iunlink_log_inode(tp, prev_ip, pag,
2035 : ip->i_next_unlinked);
2036 3003179 : prev_ip->i_next_unlinked = ip->i_next_unlinked;
2037 : } else {
2038 : /* Point the head of the list to the next unlinked inode. */
2039 32757460 : error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
2040 : ip->i_next_unlinked);
2041 : }
2042 :
2043 35761197 : ip->i_next_unlinked = NULLAGINO;
2044 35761197 : ip->i_prev_unlinked = NULLAGINO;
2045 35761197 : return error;
2046 : }
2047 :
2048 : /*
2049 : * Pull the on-disk inode from the AGI unlinked list.
2050 : */
2051 : STATIC int
2052 35762950 : xfs_iunlink_remove(
2053 : struct xfs_trans *tp,
2054 : struct xfs_perag *pag,
2055 : struct xfs_inode *ip)
2056 : {
2057 35762950 : struct xfs_buf *agibp;
2058 35762950 : int error;
2059 :
2060 35762950 : trace_xfs_iunlink_remove(ip);
2061 :
2062 : /* Get the agi buffer first. It ensures lock ordering on the list. */
2063 35762915 : error = xfs_read_agi(pag, tp, &agibp);
2064 35762504 : if (error)
2065 : return error;
2066 :
2067 35760883 : return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
2068 : }
2069 :
2070 : /*
2071 : * Look up the inode number specified and if it is not already marked XFS_ISTALE
2072 : * mark it stale. We should only find clean inodes in this lookup that aren't
2073 : * already stale.
2074 : */
2075 : static void
2076 3772537 : xfs_ifree_mark_inode_stale(
2077 : struct xfs_perag *pag,
2078 : struct xfs_inode *free_ip,
2079 : xfs_ino_t inum)
2080 : {
2081 3772537 : struct xfs_mount *mp = pag->pag_mount;
2082 3772537 : struct xfs_inode_log_item *iip;
2083 3772537 : struct xfs_inode *ip;
2084 :
2085 3772537 : retry:
2086 3772537 : rcu_read_lock();
2087 3772500 : ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2088 :
2089 : /* Inode not in memory, nothing to do */
2090 3772551 : if (!ip) {
2091 573922 : rcu_read_unlock();
2092 573922 : return;
2093 : }
2094 :
2095 : /*
2096 : * Because this is an RCU-protected lookup, we could find a recently
2097 : * freed or even reallocated inode during the lookup. We need to check
2098 : * under the i_flags_lock for a valid inode here. Skip it if it is not
2099 : * valid, the wrong inode or stale.
2100 : */
2101 3198629 : spin_lock(&ip->i_flags_lock);
2102 3198628 : if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2103 12094 : goto out_iflags_unlock;
2104 :
2105 : /*
2106 : * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2107 : * other inodes that we did not find in the list attached to the buffer
2108 : * and are not already marked stale. If we can't lock it, back off and
2109 : * retry.
2110 : */
2111 3186534 : if (ip != free_ip) {
2112 3120886 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2113 0 : spin_unlock(&ip->i_flags_lock);
2114 0 : rcu_read_unlock();
2115 0 : delay(1);
2116 0 : goto retry;
2117 : }
2118 : }
2119 3186488 : ip->i_flags |= XFS_ISTALE;
2120 :
2121 : /*
2122 : * If the inode is flushing, it is already attached to the buffer. All
2123 : * we need to do here is mark the inode stale so buffer IO completion
2124 : * will remove it from the AIL.
2125 : */
2126 3186488 : iip = ip->i_itemp;
2127 3186488 : if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2128 910 : ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2129 910 : ASSERT(iip->ili_last_fields);
2130 910 : goto out_iunlock;
2131 : }
2132 :
2133 : /*
2134 : * Inodes not attached to the buffer can be released immediately.
2135 : * Everything else has to go through xfs_iflush_abort() on journal
2136 : * commit as the flock synchronises removal of the inode from the
2137 : * cluster buffer against inode reclaim.
2138 : */
2139 3185578 : if (!iip || list_empty(&iip->ili_item.li_bio_list))
2140 256707 : goto out_iunlock;
2141 :
2142 2928871 : __xfs_iflags_set(ip, XFS_IFLUSHING);
2143 2928871 : spin_unlock(&ip->i_flags_lock);
2144 2928892 : rcu_read_unlock();
2145 :
2146 : /* we have a dirty inode in memory that has not yet been flushed. */
2147 2928900 : spin_lock(&iip->ili_lock);
2148 2928955 : iip->ili_last_fields = iip->ili_fields;
2149 2928955 : iip->ili_fields = 0;
2150 2928955 : iip->ili_fsync_fields = 0;
2151 2928955 : spin_unlock(&iip->ili_lock);
2152 2928914 : ASSERT(iip->ili_last_fields);
2153 :
2154 2928914 : if (ip != free_ip)
2155 2878033 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2156 : return;
2157 :
2158 257617 : out_iunlock:
2159 257617 : if (ip != free_ip)
2160 242832 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2161 14785 : out_iflags_unlock:
2162 269768 : spin_unlock(&ip->i_flags_lock);
2163 269793 : rcu_read_unlock();
2164 : }
2165 :
2166 : /*
2167 : * A big issue when freeing the inode cluster is that we _cannot_ skip any
2168 : * inodes that are in memory - they all must be marked stale and attached to
2169 : * the cluster buffer.
2170 : */
2171 : static int
2172 65666 : xfs_ifree_cluster(
2173 : struct xfs_trans *tp,
2174 : struct xfs_perag *pag,
2175 : struct xfs_inode *free_ip,
2176 : struct xfs_icluster *xic)
2177 : {
2178 65666 : struct xfs_mount *mp = free_ip->i_mount;
2179 65666 : struct xfs_ino_geometry *igeo = M_IGEO(mp);
2180 65666 : struct xfs_buf *bp;
2181 65666 : xfs_daddr_t blkno;
2182 65666 : xfs_ino_t inum = xic->first_ino;
2183 65666 : int nbufs;
2184 65666 : int i, j;
2185 65666 : int ioffset;
2186 65666 : int error;
2187 :
2188 65666 : nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2189 :
2190 196992 : for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2191 : /*
2192 : * The allocation bitmap tells us which inodes of the chunk were
2193 : * physically allocated. Skip the cluster if an inode falls into
2194 : * a sparse region.
2195 : */
2196 131326 : ioffset = inum - xic->first_ino;
2197 131326 : if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2198 13433 : ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2199 13433 : continue;
2200 : }
2201 :
2202 117893 : blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2203 : XFS_INO_TO_AGBNO(mp, inum));
2204 :
2205 : /*
2206 : * We obtain and lock the backing buffer first in the process
2207 : * here to ensure dirty inodes attached to the buffer remain in
2208 : * the flushing state while we mark them stale.
2209 : *
2210 : * If we scan the in-memory inodes first, then buffer IO can
2211 : * complete before we get a lock on it, and hence we may fail
2212 : * to mark all the active inodes on the buffer stale.
2213 : */
2214 117893 : error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2215 117893 : mp->m_bsize * igeo->blocks_per_cluster,
2216 : XBF_UNMAPPED, &bp);
2217 117894 : if (error)
2218 0 : return error;
2219 :
2220 : /*
2221 : * This buffer may not have been correctly initialised as we
2222 : * didn't read it from disk. That's not important because we are
2223 : * only using it to mark the buffer as stale in the log, and to
2224 : * attach stale cached inodes on it. That means it will never be
2225 : * dispatched for IO. If it is, we want to know about it, and we
2226 : * want it to fail. We can achieve this by adding a write
2227 : * verifier to the buffer.
2228 : */
2229 117894 : bp->b_ops = &xfs_inode_buf_ops;
2230 :
2231 : /*
2232 : * Now we need to set all the cached clean inodes as XFS_ISTALE,
2233 : * too. This requires lookups, and will skip inodes that we've
2234 : * already marked XFS_ISTALE.
2235 : */
2236 3890476 : for (i = 0; i < igeo->inodes_per_cluster; i++)
2237 3772582 : xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
2238 :
2239 117894 : xfs_trans_stale_inode_buf(tp, bp);
2240 117894 : xfs_trans_binval(tp, bp);
2241 : }
2242 : return 0;
2243 : }
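
/*
 * A toy rendition of the geometry walk above: one 64-inode chunk covered by
 * four 16-inode clusters, with a sparse allocation bitmap deciding which
 * clusters get their backing buffers staled. The constants are made up for
 * the demo; the real values come from M_IGEO(mp), and XFS_INOBT_MASK(i) is
 * modelled as (1ULL << i):
 */
#include <stdint.h>
#include <stdio.h>

#define CHUNK_INODES 64
#define INODES_PER_CLUSTER 16

int main(void)
{
	/* bit i set => inode i was physically allocated; sparse chunks are
	 * allocated in whole-cluster units, so testing the cluster's first
	 * bit is enough, mirroring "xic->alloc & XFS_INOBT_MASK(ioffset)". */
	uint64_t alloc = 0xffff0000ffff0000ULL;

	for (int ioffset = 0; ioffset < CHUNK_INODES;
	     ioffset += INODES_PER_CLUSTER) {
		if (!(alloc & (1ULL << ioffset))) {
			printf("cluster at %2d: sparse, skipped\n", ioffset);
			continue;
		}
		printf("cluster at %2d: stale and invalidate buffer\n", ioffset);
	}
	return 0;
}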
2244 :
2245 : /*
2246 : * This is called to return an inode to the inode free list. The inode should
2247 : * already be truncated to 0 length and have no pages associated with it. This
2248 : * routine also assumes that the inode is already a part of the transaction.
2249 : *
2250 : * The on-disk copy of the inode will have been added to the list of unlinked
2251 : * inodes in the AGI. We need to remove the inode from that list atomically with
2252 : * respect to freeing it here.
2253 : */
2254 : int
2255 34310290 : xfs_ifree(
2256 : struct xfs_trans *tp,
2257 : struct xfs_inode *ip)
2258 : {
2259 34310290 : struct xfs_mount *mp = ip->i_mount;
2260 34310290 : struct xfs_perag *pag;
2261 34310290 : struct xfs_icluster xic = { 0 };
2262 34310290 : struct xfs_inode_log_item *iip = ip->i_itemp;
2263 34310290 : int error;
2264 :
2265 34310290 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2266 34310290 : ASSERT(VFS_I(ip)->i_nlink == 0);
2267 34310290 : ASSERT(ip->i_df.if_nextents == 0);
2268 34310290 : ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2269 34310290 : ASSERT(ip->i_nblocks == 0);
2270 :
2271 34310290 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2272 :
2273 : /*
2274 : * Free the inode first so that we guarantee that the AGI lock is going
2275 : * to be taken before we remove the inode from the unlinked list. This
2276 : * makes the AGI lock -> unlinked list modification order the same as
2277 : * used in O_TMPFILE creation.
2278 : */
2279 34310634 : error = xfs_difree(tp, pag, ip->i_ino, &xic);
2280 34311507 : if (error)
2281 158 : goto out;
2282 :
2283 34311349 : error = xfs_iunlink_remove(tp, pag, ip);
2284 34310188 : if (error)
2285 0 : goto out;
2286 :
2287 : /*
2288 : * Free any local-format data sitting around before we reset the
2289 : * data fork to extents format. Note that the attr fork data has
2290 : * already been freed by xfs_attr_inactive.
2291 : */
2292 34310188 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2293 9052715 : kmem_free(ip->i_df.if_u1.if_data);
2294 9052744 : ip->i_df.if_u1.if_data = NULL;
2295 9052744 : ip->i_df.if_bytes = 0;
2296 : }
2297 :
2298 34310217 : VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2299 34310217 : ip->i_diflags = 0;
2300 34310217 : ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
2301 34310217 : ip->i_forkoff = 0; /* mark the attr fork not in use */
2302 34310217 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2303 68621669 : if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
2304 0 : xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
2305 :
2306 : /* Don't attempt to replay owner changes for a deleted inode */
2307 34311452 : spin_lock(&iip->ili_lock);
2308 34311309 : iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2309 34311309 : spin_unlock(&iip->ili_lock);
2310 :
2311 : /*
2312 : * Bump the generation count so no one will be confused
2313 : * by reincarnations of this inode.
2314 : */
2315 34310067 : VFS_I(ip)->i_generation++;
2316 34310067 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2317 :
2318 34312259 : if (xic.deleted)
2319 65666 : error = xfs_ifree_cluster(tp, pag, ip, &xic);
2320 34246593 : out:
2321 34312417 : xfs_perag_put(pag);
2322 34312287 : return error;
2323 : }
2324 :
2325 : /*
2326 : * This is called to unpin an inode. The caller must have the inode locked
2327 : * in at least shared mode so that the buffer cannot be subsequently pinned
2328 : * once someone is waiting for it to be unpinned.
2329 : */
2330 : static void
2331 12 : xfs_iunpin(
2332 : struct xfs_inode *ip)
2333 : {
2334 12 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2335 :
2336 12 : trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2337 :
2338 : /* Give the log a push to start the unpinning I/O */
2339 12 : xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2340 :
2341 12 : }
2342 :
2343 : static void
2344 12 : __xfs_iunpin_wait(
2345 : struct xfs_inode *ip)
2346 : {
2347 12 : wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2348 12 : DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2349 :
2350 12 : xfs_iunpin(ip);
2351 :
2352 12 : do {
2353 12 : prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2354 12 : if (xfs_ipincount(ip))
2355 7 : io_schedule();
2356 12 : } while (xfs_ipincount(ip));
2357 12 : finish_wait(wq, &wait.wq_entry);
2358 12 : }
2359 :
2360 : void
2361 262683621 : xfs_iunpin_wait(
2362 : struct xfs_inode *ip)
2363 : {
2364 262683621 : if (xfs_ipincount(ip))
2365 12 : __xfs_iunpin_wait(ip);
2366 262683621 : }
2367 :
2368 : /*
2369 : * Removing an inode from the namespace involves removing the directory entry
2370 : * and dropping the link count on the inode. Removing the directory entry can
2371 : * result in locking an AGF (directory blocks were freed), and dropping the link
2372 : * count can result in placing the inode on an unlinked list which results in
2373 : * locking an AGI.
2374 : *
2375 : * The big problem here is that we have an ordering constraint on AGF and AGI
2376 : * locking - inode allocation locks the AGI, then can allocate a new extent for
2377 : * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2378 : * removes the inode from the unlinked list, requiring that we lock the AGI
2379 : * first, and then freeing the inode can result in an inode chunk being freed
2380 : * and hence freeing disk space requiring that we lock an AGF.
2381 : *
2382 : * Hence the ordering that is imposed by other parts of the code is AGI before
2383 : * AGF. This means we cannot remove the directory entry before we drop the inode
2384 : * reference count and put it on the unlinked list as this results in a lock
2385 : * order of AGF then AGI, and this can deadlock against inode allocation and
2386 : * freeing. Therefore we must drop the link counts before we remove the
2387 : * directory entry.
2388 : *
2389 : * This is still safe from a transactional point of view - it is not until we
2390 : * get to xfs_defer_finish() that we have the possibility of multiple
2391 : * transactions in this operation. Hence as long as we remove the directory
2392 : * entry and drop the link count in the first transaction of the remove
2393 : * operation, there are no transactional constraints on the ordering here.
2394 : */
2395 : int
2396 35736090 : xfs_remove(
2397 : xfs_inode_t *dp,
2398 : struct xfs_name *name,
2399 : xfs_inode_t *ip)
2400 : {
2401 35736090 : xfs_mount_t *mp = dp->i_mount;
2402 35736090 : xfs_trans_t *tp = NULL;
2403 35736090 : int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2404 35736090 : int dontcare;
2405 35736090 : int error = 0;
2406 35736090 : uint resblks;
2407 :
2408 35736090 : trace_xfs_remove(dp, name);
2409 :
2410 71472320 : if (xfs_is_shutdown(mp))
2411 : return -EIO;
2412 :
2413 35735638 : error = xfs_qm_dqattach(dp);
2414 35735460 : if (error)
2415 1 : goto std_return;
2416 :
2417 35735459 : error = xfs_qm_dqattach(ip);
2418 35734099 : if (error)
2419 0 : goto std_return;
2420 :
2421 : /*
2422 : * We try to get the real space reservation first, allowing for
2423 : * directory btree deletion(s) implying possible bmap insert(s). If we
2424 : * can't get the space reservation then we use 0 instead, and avoid the
2425 : * bmap btree insert(s) in the directory code by, if the bmap insert
2426 : * tries to happen, instead trimming the LAST block from the directory.
2427 : *
2428 : * Ignore EDQUOT and ENOSPC being returned via nospace_error because
2429 : * the directory code can handle a reservationless update and we don't
2430 : * want to prevent a user from trying to free space by deleting things.
2431 : */
2432 35734099 : resblks = XFS_REMOVE_SPACE_RES(mp);
2433 35734099 : error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
2434 : &tp, &dontcare);
2435 35735935 : if (error) {
2436 0 : ASSERT(error != -ENOSPC);
2437 0 : goto std_return;
2438 : }
2439 :
2440 : /*
2441 : * If we're removing a directory perform some additional validation.
2442 : */
2443 35735935 : if (is_dir) {
2444 1374629 : ASSERT(VFS_I(ip)->i_nlink >= 2);
2445 1374629 : if (VFS_I(ip)->i_nlink != 2) {
2446 689917 : error = -ENOTEMPTY;
2447 689917 : goto out_trans_cancel;
2448 : }
2449 684712 : if (!xfs_dir_isempty(ip)) {
2450 394880 : error = -ENOTEMPTY;
2451 394880 : goto out_trans_cancel;
2452 : }
2453 :
2454 : /* Drop the link from ip's "..". */
2455 289832 : error = xfs_droplink(tp, dp);
2456 289832 : if (error)
2457 0 : goto out_trans_cancel;
2458 :
2459 : /* Drop the "." link from ip to self. */
2460 289832 : error = xfs_droplink(tp, ip);
2461 289832 : if (error)
2462 0 : goto out_trans_cancel;
2463 :
2464 : /*
2465 : * Point the unlinked child directory's ".." entry to the root
2466 : * directory to eliminate back-references to inodes that may
2467 : * get freed before the child directory is closed. If the fs
2468 : * gets shrunk, this can lead to dirent inode validation errors.
2469 : */
2470 289832 : if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2471 263002 : error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2472 : tp->t_mountp->m_sb.sb_rootino, 0);
2473 263002 : if (error)
2474 0 : goto out_trans_cancel;
2475 : }
2476 : } else {
2477 : /*
2478 : * When removing a non-directory we need to log the parent
2479 : * inode here. For a directory this is done implicitly
2480 : * by the xfs_droplink call for the ".." entry.
2481 : */
2482 34361306 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2483 : }
2484 34651308 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2485 :
2486 : /* Drop the link from dp to ip. */
2487 34650833 : error = xfs_droplink(tp, ip);
2488 34650591 : if (error)
2489 9 : goto out_trans_cancel;
2490 :
2491 34650582 : error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2492 34648535 : if (error) {
2493 2 : ASSERT(error != -ENOENT);
2494 2 : goto out_trans_cancel;
2495 : }
2496 :
2497 : /*
2498 : * If this is a synchronous mount, make sure that the
2499 : * remove transaction goes to disk before returning to
2500 : * the user.
2501 : */
2502 34648533 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2503 218 : xfs_trans_set_sync(tp);
2504 :
2505 34648533 : error = xfs_trans_commit(tp);
2506 34651395 : if (error)
2507 0 : goto std_return;
2508 :
2509 34651395 : if (is_dir && xfs_inode_is_filestream(ip))
2510 802 : xfs_filestream_deassociate(ip);
2511 :
2512 : return 0;
2513 :
2514 1084808 : out_trans_cancel:
2515 1084808 : xfs_trans_cancel(tp);
2516 : std_return:
2517 : return error;
2518 : }
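
/*
 * The AGI-before-AGF rule discussed above is classic ABBA avoidance: every
 * path that may need both locks acquires them in one global order. A toy
 * sketch with two mutexes standing in for the AGI and AGF buffer locks
 * (illustrative only; the real locks are buffer locks, not mutexes):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t agi = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t agf = PTHREAD_MUTEX_INITIALIZER;

static void unlink_path(void)
{
	pthread_mutex_lock(&agi);	/* drop nlink, join unlinked list */
	pthread_mutex_lock(&agf);	/* then free directory blocks */
	pthread_mutex_unlock(&agf);
	pthread_mutex_unlock(&agi);
}

static void ialloc_path(void)
{
	pthread_mutex_lock(&agi);	/* allocate the inode */
	pthread_mutex_lock(&agf);	/* then maybe a new inode chunk */
	pthread_mutex_unlock(&agf);
	pthread_mutex_unlock(&agi);
}

int main(void)
{
	unlink_path();
	ialloc_path();
	printf("AGI -> AGF everywhere: no ABBA deadlock\n");
	return 0;
}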
2519 :
2520 : /*
2521 : * Enter all inodes for a rename transaction into a sorted array.
2522 : */
2523 : #define __XFS_SORT_INODES 5
2524 : STATIC void
2525 22740419 : xfs_sort_for_rename(
2526 : struct xfs_inode *dp1, /* in: old (source) directory inode */
2527 : struct xfs_inode *dp2, /* in: new (target) directory inode */
2528 : struct xfs_inode *ip1, /* in: inode of old entry */
2529 : struct xfs_inode *ip2, /* in: inode of new entry */
2530 : struct xfs_inode *wip, /* in: whiteout inode */
2531 : struct xfs_inode **i_tab,/* out: sorted array of inodes */
2532 : int *num_inodes) /* in/out: inodes in array */
2533 : {
2534 22740419 : int i, j;
2535 :
2536 22740419 : ASSERT(*num_inodes == __XFS_SORT_INODES);
2537 22740419 : memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2538 :
2539 : /*
2540 : * i_tab contains a list of pointers to inodes. We initialize
2541 : * the table here & we'll sort it. We will then use it to
2542 : * order the acquisition of the inode locks.
2543 : *
2544 : * Note that the table may contain duplicates. e.g., dp1 == dp2.
2545 : */
2546 22740419 : i = 0;
2547 22740419 : i_tab[i++] = dp1;
2548 22740419 : i_tab[i++] = dp2;
2549 22740419 : i_tab[i++] = ip1;
2550 22740419 : if (ip2)
2551 6955141 : i_tab[i++] = ip2;
2552 22740419 : if (wip)
2553 1443646 : i_tab[i++] = wip;
2554 22740419 : *num_inodes = i;
2555 :
2556 : /*
2557 : * Sort the elements via bubble sort. (Remember, there are at
2558 : * most 5 elements to sort, so this is adequate.)
2559 : */
2560 99360328 : for (i = 0; i < *num_inodes; i++) {
2561 263503694 : for (j = 1; j < *num_inodes; j++) {
2562 186883785 : if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2563 42467975 : struct xfs_inode *temp = i_tab[j];
2564 42467975 : i_tab[j] = i_tab[j-1];
2565 42467975 : i_tab[j-1] = temp;
2566 : }
2567 : }
2568 : }
2569 22740419 : }
2570 :
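/*
 * A self-contained rendition of the sort above. Duplicates (e.g. dp1 ==
 * dp2 for a same-directory rename) are harmless because equal keys are
 * never swapped, and n <= 5 keeps the quadratic cost trivial:
 */
#include <stdint.h>
#include <stdio.h>

struct toy_inode { uint64_t i_ino; };

static void toy_sort(struct toy_inode **tab, int n)
{
	for (int i = 0; i < n; i++)
		for (int j = 1; j < n; j++)
			if (tab[j]->i_ino < tab[j - 1]->i_ino) {
				struct toy_inode *tmp = tab[j];

				tab[j] = tab[j - 1];
				tab[j - 1] = tmp;
			}
}

int main(void)
{
	struct toy_inode dp = { 128 }, ip1 = { 600 }, ip2 = { 42 };
	/* dp appears twice: a rename within a single directory */
	struct toy_inode *tab[] = { &dp, &dp, &ip1, &ip2 };

	toy_sort(tab, 4);
	for (int i = 0; i < 4; i++)	/* prints 42 128 128 600 */
		printf("%llu ", (unsigned long long)tab[i]->i_ino);
	printf("\n");	/* inodes are then locked in this order */
	return 0;
}
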
2571 : static int
2572 22738120 : xfs_finish_rename(
2573 : struct xfs_trans *tp)
2574 : {
2575 : /*
2576 : * If this is a synchronous mount, make sure that the rename transaction
2577 : * goes to disk before returning to the user.
2578 : */
2579 22738120 : if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2580 0 : xfs_trans_set_sync(tp);
2581 :
2582 22738120 : return xfs_trans_commit(tp);
2583 : }
2584 :
2585 : /*
2586 : * xfs_cross_rename()
2587 : *
2588 : * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
2589 : */
2590 : STATIC int
2591 6712346 : xfs_cross_rename(
2592 : struct xfs_trans *tp,
2593 : struct xfs_inode *dp1,
2594 : struct xfs_name *name1,
2595 : struct xfs_inode *ip1,
2596 : struct xfs_inode *dp2,
2597 : struct xfs_name *name2,
2598 : struct xfs_inode *ip2,
2599 : int spaceres)
2600 : {
2601 6712346 : int error = 0;
2602 6712346 : int ip1_flags = 0;
2603 6712346 : int ip2_flags = 0;
2604 6712346 : int dp2_flags = 0;
2605 :
2606 : /* Swap inode number for dirent in first parent */
2607 6712346 : error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2608 6712347 : if (error)
2609 127 : goto out_trans_abort;
2610 :
2611 : /* Swap inode number for dirent in second parent */
2612 6712220 : error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2613 6712220 : if (error)
2614 2 : goto out_trans_abort;
2615 :
2616 : /*
2617 : * If we're renaming one or more directories across different parents,
2618 : * update the respective ".." entries (and link counts) to match the new
2619 : * parents.
2620 : */
2621 6712218 : if (dp1 != dp2) {
2622 6498156 : dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2623 :
2624 6498156 : if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2625 2119436 : error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2626 : dp1->i_ino, spaceres);
2627 2119436 : if (error)
2628 0 : goto out_trans_abort;
2629 :
2630 : /* transfer ip2 ".." reference to dp1 */
2631 2119436 : if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2632 8 : error = xfs_droplink(tp, dp2);
2633 8 : if (error)
2634 0 : goto out_trans_abort;
2635 8 : xfs_bumplink(tp, dp1);
2636 : }
2637 :
2638 : /*
2639 : * Although ip1 isn't changed here, userspace needs
2640 : * to be warned about the change, so that applications
2641 : * relying on it (like backup tools) will be properly
2642 : * notified of the change.
2643 : */
2644 : ip1_flags |= XFS_ICHGTIME_CHG;
2645 : ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2646 : }
2647 :
2648 6498156 : if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2649 2119436 : error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2650 : dp2->i_ino, spaceres);
2651 2119436 : if (error)
2652 0 : goto out_trans_abort;
2653 :
2654 : /* transfer ip1 ".." reference to dp2 */
2655 2119436 : if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2656 8 : error = xfs_droplink(tp, dp1);
2657 8 : if (error)
2658 0 : goto out_trans_abort;
2659 8 : xfs_bumplink(tp, dp2);
2660 : }
2661 :
2662 : /*
2663 : * Although ip2 isn't changed here, userspace needs
2664 : * to be warned about the change, so that applications
2665 : * relying on it (like backup tools) will be properly
2666 : * notified of the change.
2667 : */
2668 2119436 : ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2669 2119436 : ip2_flags |= XFS_ICHGTIME_CHG;
2670 : }
2671 : }
2672 :
2673 6712218 : if (ip1_flags) {
2674 2119444 : xfs_trans_ichgtime(tp, ip1, ip1_flags);
2675 2119444 : xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2676 : }
2677 6712218 : if (ip2_flags) {
2678 2119444 : xfs_trans_ichgtime(tp, ip2, ip2_flags);
2679 2119444 : xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2680 : }
2681 6712218 : if (dp2_flags) {
2682 6498156 : xfs_trans_ichgtime(tp, dp2, dp2_flags);
2683 6498156 : xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2684 : }
2685 6712218 : xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2686 6712218 : xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2687 6712218 : return xfs_finish_rename(tp);
2688 :
2689 129 : out_trans_abort:
2690 129 : xfs_trans_cancel(tp);
2691 129 : return error;
2692 : }
2693 :
2694 : /*
2695 : * xfs_rename_alloc_whiteout()
2696 : *
2697 : * Return a referenced, unlinked, unlocked inode that can be used as a
2698 : * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2699 : * crash between allocating the inode and linking it into the rename
2700 : * transaction, recovery will free the inode and we won't leak it.
2701 : */
2702 : static int
2703 1480373 : xfs_rename_alloc_whiteout(
2704 : struct mnt_idmap *idmap,
2705 : struct xfs_name *src_name,
2706 : struct xfs_inode *dp,
2707 : struct xfs_inode **wip)
2708 : {
2709 1480373 : struct xfs_inode *tmpfile;
2710 1480373 : struct qstr name;
2711 1480373 : int error;
2712 :
2713 1480373 : error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
2714 : &tmpfile);
2715 1480372 : if (error)
2716 : return error;
2717 :
2718 1443646 : name.name = src_name->name;
2719 1443646 : name.len = src_name->len;
2720 1443646 : error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
2721 1443646 : if (error) {
2722 0 : xfs_finish_inode_setup(tmpfile);
2723 0 : xfs_irele(tmpfile);
2724 0 : return error;
2725 : }
2726 :
2727 : /*
2728 : * Prepare the tmpfile inode as if it were created through the VFS.
2729 : * Complete the inode setup and flag it as linkable. nlink is already
2730 : * zero, so we can skip the drop_nlink.
2731 : */
2732 1443646 : xfs_setup_iops(tmpfile);
2733 1443646 : xfs_finish_inode_setup(tmpfile);
2734 1443646 : VFS_I(tmpfile)->i_state |= I_LINKABLE;
2735 :
2736 1443646 : *wip = tmpfile;
2737 1443646 : return 0;
2738 : }
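
/*
 * The caller-visible side of the whiteout and exchange paths is the
 * renameat2() syscall. A userspace sketch, assuming glibc 2.28+ for the
 * renameat2() wrapper (the RENAME_* fallback values below are the stable
 * Linux UAPI constants); RENAME_WHITEOUT normally requires CAP_MKNOD:
 */
#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>		/* renameat2() with _GNU_SOURCE */

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif
#ifndef RENAME_WHITEOUT
#define RENAME_WHITEOUT (1 << 2)
#endif

int main(void)
{
	/* Atomically swap "a" and "b" (both must exist): the
	 * xfs_cross_rename() path above */
	if (renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE))
		perror("RENAME_EXCHANGE");

	/* Move "a" to "c" and leave a whiteout (a 0:0 chardev) at "a":
	 * this is what drives xfs_rename_alloc_whiteout() above */
	if (renameat2(AT_FDCWD, "a", AT_FDCWD, "c", RENAME_WHITEOUT))
		perror("RENAME_WHITEOUT");
	return 0;
}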
2739 :
2740 : /*
2741 : * xfs_rename
2742 : */
2743 : int
2744 22777130 : xfs_rename(
2745 : struct mnt_idmap *idmap,
2746 : struct xfs_inode *src_dp,
2747 : struct xfs_name *src_name,
2748 : struct xfs_inode *src_ip,
2749 : struct xfs_inode *target_dp,
2750 : struct xfs_name *target_name,
2751 : struct xfs_inode *target_ip,
2752 : unsigned int flags)
2753 : {
2754 22777130 : struct xfs_mount *mp = src_dp->i_mount;
2755 22777130 : struct xfs_trans *tp;
2756 22777130 : struct xfs_inode *wip = NULL; /* whiteout inode */
2757 22777130 : struct xfs_inode *inodes[__XFS_SORT_INODES];
2758 22777130 : int i;
2759 22777130 : int num_inodes = __XFS_SORT_INODES;
2760 22777130 : bool new_parent = (src_dp != target_dp);
2761 22777130 : bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2762 22777130 : int spaceres;
2763 22777130 : bool retried = false;
2764 22777130 : int error, nospace_error = 0;
2765 :
2766 22777130 : trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2767 :
2768 22777128 : if ((flags & RENAME_EXCHANGE) && !target_ip)
2769 : return -EINVAL;
2770 :
2771 : /*
2772 : * If we are doing a whiteout operation, allocate the whiteout inode
2773 : * we will be placing at the target and ensure the type is set
2774 : * appropriately.
2775 : */
2776 22777128 : if (flags & RENAME_WHITEOUT) {
2777 1480368 : error = xfs_rename_alloc_whiteout(idmap, src_name,
2778 : target_dp, &wip);
2779 1480373 : if (error)
2780 : return error;
2781 :
2782 : /* setup target dirent info as whiteout */
2783 1443646 : src_name->type = XFS_DIR3_FT_CHRDEV;
2784 : }
2785 :
2786 22740406 : xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2787 : inodes, &num_inodes);
2788 :
2789 22741322 : retry:
2790 22741322 : nospace_error = 0;
2791 22741322 : spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2792 22741322 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2793 22741331 : if (error == -ENOSPC) {
2794 199502 : nospace_error = error;
2795 199502 : spaceres = 0;
2796 199502 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2797 : &tp);
2798 : }
2799 22741331 : if (error)
2800 269 : goto out_release_wip;
2801 :
2802 : /*
2803 : * Attach the dquots to the inodes
2804 : */
2805 22741062 : error = xfs_qm_vop_rename_dqattach(inodes);
2806 22741062 : if (error)
2807 730 : goto out_trans_cancel;
2808 :
2809 : /*
2810 : * Lock all the participating inodes. Depending upon whether
2811 : * the target_name exists in the target directory, and
2812 : * whether the target directory is the same as the source
2813 : * directory, we can lock from 2 to 5 inodes.
2814 : */
2815 22740332 : xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2816 :
2817 : /*
2818 : * Join all the inodes to the transaction. From this point on,
2819 : * we can rely on either trans_commit or trans_cancel to unlock
2820 : * them.
2821 : */
2822 22740329 : xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2823 22740322 : if (new_parent)
2824 21303132 : xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2825 22740322 : xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2826 22740323 : if (target_ip)
2827 6954147 : xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2828 22740319 : if (wip)
2829 1443628 : xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2830 :
2831 : /*
2832 : * If we are using project inheritance, we only allow renames
2833 : * into our tree when the project IDs are the same; else the
2834 : * tree quota mechanism would be circumvented.
2835 : */
2836 22740324 : if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
2837 : target_dp->i_projid != src_ip->i_projid)) {
2838 0 : error = -EXDEV;
2839 0 : goto out_trans_cancel;
2840 : }
2841 :
2842 : /* RENAME_EXCHANGE is unique from here on. */
2843 22740324 : if (flags & RENAME_EXCHANGE)
2844 6712346 : return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2845 : target_dp, target_name, target_ip,
2846 : spaceres);
2847 :
2848 : /*
2849 : * Try to reserve quota to handle an expansion of the target directory.
2850 : * We'll allow the rename to continue in reservationless mode if we hit
2851 : * a space usage constraint. If we trigger reservationless mode, save
2852 : * the errno if there isn't any free space in the target directory.
2853 : */
2854 16027978 : if (spaceres != 0) {
2855 15881564 : error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
2856 : 0, false);
2857 15881572 : if (error == -EDQUOT || error == -ENOSPC) {
2858 1651 : if (!retried) {
2859 909 : xfs_trans_cancel(tp);
2860 909 : xfs_blockgc_free_quota(target_dp, 0);
2861 909 : retried = true;
2862 909 : goto retry;
2863 : }
2864 :
2865 : nospace_error = error;
2866 : spaceres = 0;
2867 : error = 0;
2868 : }
2869 15880663 : if (error)
2870 0 : goto out_trans_cancel;
2871 : }
2872 :
2873 : /*
2874 : * Check for expected errors before we dirty the transaction
2875 : * so we can return an error without a transaction abort.
2876 : */
2877 16027077 : if (target_ip == NULL) {
2878 : /*
2879 : * If there's no space reservation, check the entry will
2880 : * fit before actually inserting it.
2881 : */
2882 15785273 : if (!spaceres) {
2883 147158 : error = xfs_dir_canenter(tp, target_dp, target_name);
2884 147158 : if (error)
2885 1139 : goto out_trans_cancel;
2886 : }
2887 : } else {
2888 : /*
2889 : * If target exists and it's a directory, check whether
2890 : * it can be destroyed.
2891 : */
2892 242106 : if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
2893 302 : (!xfs_dir_isempty(target_ip) ||
2894 284 : (VFS_I(target_ip)->i_nlink > 2))) {
2895 18 : error = -EEXIST;
2896 18 : goto out_trans_cancel;
2897 : }
2898 : }
2899 :
2900 : /*
2901 : * Lock the AGI buffers we need to handle bumping the nlink of the
2902 : * whiteout inode off the unlinked list and to handle dropping the
2903 : * nlink of the target inode. Per locking order rules, do this in
2904 : * increasing AG order and before directory block allocation tries to
2905 : * grab AGFs because we grab AGIs before AGFs.
2906 : *
2907 : * The (vfs) caller must ensure that if src is a directory then
2908 : * target_ip is either null or an empty directory.
2909 : */
2910 65788648 : for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
2911 98082275 : if (inodes[i] == wip ||
2912 48319545 : (inodes[i] == target_ip &&
2913 241785 : (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
2914 1662992 : struct xfs_perag *pag;
2915 1662992 : struct xfs_buf *bp;
2916 :
2917 4988976 : pag = xfs_perag_get(mp,
2918 1662992 : XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
2919 1662992 : error = xfs_read_agi(pag, tp, &bp);
2920 1662990 : xfs_perag_put(pag);
2921 1662990 : if (error)
2922 5 : goto out_trans_cancel;
2923 : }
2924 : }
2925 :
2926 : /*
2927 : * Directory entry creation below may acquire the AGF. Remove
2928 : * the whiteout from the unlinked list first to preserve correct
2929 : * AGI/AGF locking order. This dirties the transaction so failures
2930 : * after this point will abort and log recovery will clean up the
2931 : * mess.
2932 : *
2933 : * For whiteouts, we need to bump the link count on the whiteout
2934 : * inode. After this point we have a real link, so clear the tmpfile
2935 : * state flag from the inode so that it doesn't accidentally get
2936 : * misused in the future.
2937 : */
2938 16025915 : if (wip) {
2939 1443187 : struct xfs_perag *pag;
2940 :
2941 1443187 : ASSERT(VFS_I(wip)->i_nlink == 0);
2942 :
2943 1443187 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
2944 1443186 : error = xfs_iunlink_remove(tp, pag, wip);
2945 1443188 : xfs_perag_put(pag);
2946 1443188 : if (error)
2947 0 : goto out_trans_cancel;
2948 :
2949 1443188 : xfs_bumplink(tp, wip);
2950 1443188 : VFS_I(wip)->i_state &= ~I_LINKABLE;
2951 : }
2952 :
2953 : /*
2954 : * Set up the target.
2955 : */
2956 16025916 : if (target_ip == NULL) {
2957 : /*
2958 : * If target does not exist and the rename crosses
2959 : * directories, adjust the target directory link count
2960 : * to account for the ".." reference from the new entry.
2961 : */
2962 15784133 : error = xfs_dir_createname(tp, target_dp, target_name,
2963 : src_ip->i_ino, spaceres);
2964 15784130 : if (error)
2965 12 : goto out_trans_cancel;
2966 :
2967 15784118 : xfs_trans_ichgtime(tp, target_dp,
2968 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2969 :
2970 15784119 : if (new_parent && src_is_directory) {
2971 4605058 : xfs_bumplink(tp, target_dp);
2972 : }
2973 : } else { /* target_ip != NULL */
2974 : /*
2975 : * Link the source inode under the target name.
2976 : * If the source inode is a directory and we are moving
2977 : * it across directories, its ".." entry will be
2978 : * inconsistent until we replace that down below.
2979 : *
2980 : * In case there is already an entry with the same
2981 : * name at the destination directory, remove it first.
2982 : */
2983 241783 : error = xfs_dir_replace(tp, target_dp, target_name,
2984 : src_ip->i_ino, spaceres);
2985 241783 : if (error)
2986 0 : goto out_trans_cancel;
2987 :
2988 241783 : xfs_trans_ichgtime(tp, target_dp,
2989 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2990 :
2991 : /*
2992 : * Decrement the link count on the target since the target
2993 : * dir no longer points to it.
2994 : */
2995 241783 : error = xfs_droplink(tp, target_ip);
2996 241783 : if (error)
2997 1 : goto out_trans_cancel;
2998 :
2999 241782 : if (src_is_directory) {
3000 : /*
3001 : * Drop the link from the old "." entry.
3002 : */
3003 284 : error = xfs_droplink(tp, target_ip);
3004 284 : if (error)
3005 0 : goto out_trans_cancel;
3006 : }
3007 : } /* target_ip != NULL */
3008 :
3009 : /*
3010 : * Remove the source.
3011 : */
3012 16025901 : if (new_parent && src_is_directory) {
3013 : /*
3014 : * Rewrite the ".." entry to point to the new
3015 : * directory.
3016 : */
3017 4605066 : error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3018 : target_dp->i_ino, spaceres);
3019 4605066 : ASSERT(error != -EEXIST);
3020 4605066 : if (error)
3021 0 : goto out_trans_cancel;
3022 : }
3023 :
3024 : /*
3025 : * We always want to hit the ctime on the source inode.
3026 : *
3027 : * This isn't strictly required by the standards since the source
3028 : * inode isn't really being changed, but old unix file systems did
3029 : * it and some incremental backup programs won't work without it.
3030 : */
3031 16025901 : xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3032 16025892 : xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3033 :
3034 : /*
3035 : * Adjust the link count on src_dp. This is necessary when
3036 : * renaming a directory, either within one parent when
3037 : * the target existed, or across two parent directories.
3038 : */
3039 16025903 : if (src_is_directory && (new_parent || target_ip != NULL)) {
3040 :
3041 : /*
3042 : * Decrement link count on src_directory since the
3043 : * entry that's moved no longer points to it.
3044 : */
3045 4605342 : error = xfs_droplink(tp, src_dp);
3046 4605342 : if (error)
3047 0 : goto out_trans_cancel;
3048 : }
3049 :
3050 : /*
3051 : * For whiteouts, we only need to update the source dirent with the
3052 : * inode number of the whiteout inode rather than removing it
3053 : * altogether.
3054 : */
3055 16025903 : if (wip)
3056 1443188 : error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3057 : spaceres);
3058 : else
3059 14582715 : error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3060 : spaceres);
3061 :
3062 16025904 : if (error)
3063 2 : goto out_trans_cancel;
3064 :
3065 16025902 : xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3066 16025902 : xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3067 16025902 : if (new_parent)
3068 14803630 : xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3069 :
3070 16025902 : error = xfs_finish_rename(tp);
3071 16025902 : if (wip)
3072 1443188 : xfs_irele(wip);
3073 : return error;
3074 :
3075 1907 : out_trans_cancel:
3076 1907 : xfs_trans_cancel(tp);
3077 2176 : out_release_wip:
3078 2176 : if (wip)
3079 458 : xfs_irele(wip);
3080 2176 : if (error == -ENOSPC && nospace_error)
3081 1139 : error = nospace_error;
3082 : return error;
3083 : }
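
/*
 * The reservation handling in xfs_rename() above follows a
 * try / reclaim-once / degrade pattern. A generic sketch of that shape;
 * reserve() and reclaim() are illustrative stand-ins for
 * xfs_trans_reserve_quota_nblks() and xfs_blockgc_free_quota(), not
 * kernel APIs:
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int reserve(unsigned blocks)	/* pretend the fs is full */
{
	return blocks ? -ENOSPC : 0;
}

static void reclaim(void) { }		/* garbage-collect speculative space */

int main(void)
{
	unsigned spaceres = 16;
	bool retried = false;
	int nospace_error = 0, error;

retry:
	error = reserve(spaceres);
	if (error == -ENOSPC || error == -EDQUOT) {
		if (!retried) {
			reclaim();
			retried = true;
			goto retry;
		}
		nospace_error = error;	/* reported if we later fail */
		spaceres = 0;		/* continue without a reservation */
		error = 0;
	}
	printf("error=%d nospace_error=%d spaceres=%u\n",
	       error, nospace_error, spaceres);
	return 0;
}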
3084 :
3085 : static int
3086 120317354 : xfs_iflush(
3087 : struct xfs_inode *ip,
3088 : struct xfs_buf *bp)
3089 : {
3090 120317354 : struct xfs_inode_log_item *iip = ip->i_itemp;
3091 120317354 : struct xfs_dinode *dip;
3092 120317354 : struct xfs_mount *mp = ip->i_mount;
3093 120317354 : int error;
3094 :
3095 120317354 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3096 240634708 : ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3097 120317354 : ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3098 : ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3099 120317354 : ASSERT(iip->ili_item.li_buf == bp);
3100 :
3101 120317354 : dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3102 :
3103 : /*
3104 : * We don't flush the inode if any of the following checks fail, but we
3105 : * do still update the log item and attach to the backing buffer as if
3106 : * the flush happened. This is a formality to facilitate predictable
3107 : * error handling as the caller will shutdown and fail the buffer.
3108 : */
3109 120317354 : error = -EFSCORRUPTED;
3110 120317354 : if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3111 : mp, XFS_ERRTAG_IFLUSH_1)) {
3112 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3113 : "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
3114 : __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3115 0 : goto flush_out;
3116 : }
3117 120317354 : if (S_ISREG(VFS_I(ip)->i_mode)) {
3118 73719436 : if (XFS_TEST_ERROR(
3119 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3120 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3121 : mp, XFS_ERRTAG_IFLUSH_3)) {
3122 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3123 : "%s: Bad regular inode %llu, ptr "PTR_FMT,
3124 : __func__, ip->i_ino, ip);
3125 0 : goto flush_out;
3126 : }
3127 46597918 : } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3128 30483883 : if (XFS_TEST_ERROR(
3129 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3130 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3131 : ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3132 : mp, XFS_ERRTAG_IFLUSH_4)) {
3133 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3134 : "%s: Bad directory inode %llu, ptr "PTR_FMT,
3135 : __func__, ip->i_ino, ip);
3136 0 : goto flush_out;
3137 : }
3138 : }
3139 240634708 : if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
3140 : ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3141 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3142 : "%s: detected corrupt incore inode %llu, "
3143 : "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
3144 : __func__, ip->i_ino,
3145 : ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
3146 : ip->i_nblocks, ip);
3147 0 : goto flush_out;
3148 : }
3149 120317354 : if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3150 : mp, XFS_ERRTAG_IFLUSH_6)) {
3151 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3152 : "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
3153 : __func__, ip->i_ino, ip->i_forkoff, ip);
3154 0 : goto flush_out;
3155 : }
3156 :
3157 : /*
3158 : * Inode item log recovery for v2 inodes is dependent on the flushiter
3159 : * count for correct sequencing. We bump the flush iteration count so
3160 : * we can detect flushes which postdate a log record during recovery.
3161 : * This is redundant as we now log every change and hence this can't
3162 : * happen, but we still need to do it to ensure backwards compatibility
3163 : * with old kernels that predate logging all inode changes.
3164 : */
3165 120317354 : if (!xfs_has_v3inodes(mp))
3166 242 : ip->i_flushiter++;
3167 :
3168 : /*
3169 : * If there are inline format data / attr forks attached to this inode,
3170 : * make sure they are not corrupt.
3171 : */
3172 152197249 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3173 31879895 : xfs_ifork_verify_local_data(ip))
3174 0 : goto flush_out;
3175 120317354 : if (xfs_inode_has_attr_fork(ip) &&
3176 62127950 : ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
3177 18466547 : xfs_ifork_verify_local_attr(ip))
3178 0 : goto flush_out;
3179 :
3180 : /*
3181 : * Copy the dirty parts of the inode into the on-disk inode. We always
3182 : * copy out the core of the inode, because if the inode is dirty at all
3183 : * the core must be.
3184 : */
3185 120317354 : xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3186 :
3187 : /* Wrap, we never let the log put out DI_MAX_FLUSH */
3188 120317354 : if (!xfs_has_v3inodes(mp)) {
3189 242 : if (ip->i_flushiter == DI_MAX_FLUSH)
3190 0 : ip->i_flushiter = 0;
3191 : }
3192 :
3193 120317354 : xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3194 120317354 : if (xfs_inode_has_attr_fork(ip))
3195 43661403 : xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3196 :
3197 : /*
3198 : * We've recorded everything logged in the inode, so we'd like to clear
3199 : * the ili_fields bits so we don't log and flush things unnecessarily.
3200 : * However, we can't stop logging all this information until the data
3201 : * we've copied into the disk buffer is written to disk. If we did we
3202 : * might overwrite the copy of the inode in the log with all the data
3203 : * after re-logging only part of it, and in the face of a crash we
3204 : * wouldn't have all the data we need to recover.
3205 : *
3206 : * What we do is move the bits to the ili_last_fields field. When
3207 : * logging the inode, these bits are moved back to the ili_fields field.
3208 : * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3209 : * we know that the information those bits represent is permanently on
3210 : * disk. As long as the flush completes before the inode is logged
3211 : * again, then both ili_fields and ili_last_fields will be cleared.
3212 : */
3213 : error = 0;
3214 120317354 : flush_out:
3215 120317354 : spin_lock(&iip->ili_lock);
3216 120317354 : iip->ili_last_fields = iip->ili_fields;
3217 120317354 : iip->ili_fields = 0;
3218 120317354 : iip->ili_fsync_fields = 0;
3219 120317354 : spin_unlock(&iip->ili_lock);
3220 :
3221 : /*
3222 : * Store the current LSN of the inode so that xfs_buf_inode_iodone()
3223 : * can tell whether the item has moved in the AIL.
3224 : */
3225 120317354 : xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3226 : &iip->ili_item.li_lsn);
3227 :
3228 : /* generate the checksum. */
3229 120317354 : xfs_dinode_calc_crc(mp, dip);
3230 120317354 : return error;
3231 : }
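/*
 * Annotation (illustrative, not part of xfs_inode.c): the ili_fields
 * hand-off performed above, shown as a timeline.  The field names are the
 * real ones; the ordering follows the comment in xfs_iflush():
 *
 *	transaction logs inode:	  iip->ili_fields |= XFS_ILOG_CORE | ...
 *	xfs_iflush():		  iip->ili_last_fields = iip->ili_fields;
 *				  iip->ili_fields = 0;
 *	relogged before I/O done: the ili_last_fields bits are moved back
 *				  into ili_fields so they keep being logged
 *	xfs_buf_inode_iodone():	  iip->ili_last_fields = 0;
 *
 * Only when both masks are clear is the flushed state known to be
 * permanently on disk.
 */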
3232 :
3233 : /*
3234 : * Non-blocking flush of dirty inode metadata into the backing buffer.
3235 : *
3236 : * The caller must have a reference to the inode and hold the cluster buffer
3237 : * locked. The function will walk across all the inodes on the cluster buffer it
3238 : * can find and lock without blocking, and flush them to the cluster buffer.
3239 : *
3240 : * On successful flushing of at least one inode, the caller must write out the
3241 : * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3242 : * the caller needs to release the buffer. On failure, the filesystem will be
3243 : * shut down, the buffer will have been unlocked and released, and a negative
3244 : * error code (-EFSCORRUPTED or -EIO) will be returned.
3245 : */
3246 : int
3247 20489739 : xfs_iflush_cluster(
3248 : struct xfs_buf *bp)
3249 : {
3250 20489739 : struct xfs_mount *mp = bp->b_mount;
3251 20489739 : struct xfs_log_item *lip, *n;
3252 20489739 : struct xfs_inode *ip;
3253 20489739 : struct xfs_inode_log_item *iip;
3254 20489739 : int clcount = 0;
3255 20489739 : int error = 0;
3256 :
3257 : /*
3258 : * We must use the safe variant here as on shutdown xfs_iflush_abort()
3259 : * will remove the log item from the list.
3260 : */
3261 145932482 : list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3262 125442743 : iip = (struct xfs_inode_log_item *)lip;
3263 125442743 : ip = iip->ili_inode;
3264 :
3265 : /*
3266 : * Quick and dirty check to avoid locks if possible.
3267 : */
3268 125442743 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3269 137246 : continue;
3270 125305497 : if (xfs_ipincount(ip))
3271 3113715 : continue;
3272 :
3273 : /*
3274 : * The inode is still attached to the buffer, which means it is
3275 : * dirty but reclaim might try to grab it. Check carefully for
3276 : * that, and grab the ilock while still holding the i_flags_lock
3277 : * to guarantee reclaim will not be able to reclaim this inode
3278 : * once we drop the i_flags_lock.
3279 : */
3280 122191782 : spin_lock(&ip->i_flags_lock);
3281 122191782 : ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3282 122191782 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3283 0 : spin_unlock(&ip->i_flags_lock);
3284 0 : continue;
3285 : }
3286 :
3287 : /*
3288 : * ILOCK will pin the inode against reclaim and prevent
3289 : * concurrent transactions modifying the inode while we are
3290 : * flushing the inode. If we get the lock, set the flushing
3291 : * state before we drop the i_flags_lock.
3292 : */
3293 122191782 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3294 940679 : spin_unlock(&ip->i_flags_lock);
3295 940679 : continue;
3296 : }
3297 121251103 : __xfs_iflags_set(ip, XFS_IFLUSHING);
3298 121251103 : spin_unlock(&ip->i_flags_lock);
3299 :
3300 : /*
3301 : * Abort flushing this inode if we are shut down because the
3302 : * inode may not currently be in the AIL. This can occur when
3303 : * log I/O failure unpins the inode without inserting into the
3304 : * AIL, leaving a dirty/unpinned inode attached to the buffer
3305 : * that otherwise looks like it should be flushed.
3306 : */
3307 242502206 : if (xlog_is_shutdown(mp->m_log)) {
3308 933748 : xfs_iunpin_wait(ip);
3309 933748 : xfs_iflush_abort(ip);
3310 933748 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3311 933748 : error = -EIO;
3312 933748 : continue;
3313 : }
3314 :
3315 : /* don't block waiting on a log force to unpin dirty inodes */
3316 120317355 : if (xfs_ipincount(ip)) {
3317 1 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3318 1 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3319 1 : continue;
3320 : }
3321 :
3322 120317354 : if (!xfs_inode_clean(ip))
3323 120317354 : error = xfs_iflush(ip, bp);
3324 : else
3325 0 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3326 120317354 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3327 120317354 : if (error)
3328 : break;
3329 120317354 : clcount++;
3330 : }
3331 :
3332 20489739 : if (error) {
3333 : /*
3334 : * Shutdown first so we kill the log before we release this
3335 : * buffer. If it is an INODE_ALLOC buffer and pins the tail
3336 : * of the log, failing it before the _log_ is shut down can
3337 : * result in the log tail being moved forward in the journal
3338 : * on disk because log writes can still be taking place. Hence
3339 : * unpinning the tail will allow the ICREATE intent to be
3340 : * removed from the log and recovery will fail with uninitialised
3341 : * inode cluster buffers.
3342 : */
3343 246124 : xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3344 246124 : bp->b_flags |= XBF_ASYNC;
3345 246124 : xfs_buf_ioend_fail(bp);
3346 246124 : return error;
3347 : }
3348 :
3349 20243615 : if (!clcount)
3350 : return -EAGAIN;
3351 :
3352 20138114 : XFS_STATS_INC(mp, xs_icluster_flushcnt);
3353 20138114 : XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3354 20138114 : return 0;
3356 : }
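/*
 * Illustrative caller sketch (hypothetical, heavily simplified): honouring
 * the return contract documented above xfs_iflush_cluster().  The write-out
 * mechanics of the real AIL pushing code are elided.
 *
 *	error = xfs_iflush_cluster(bp);
 *	if (error == -EAGAIN)
 *		xfs_buf_relse(bp);	(nothing was flushed; just release)
 *	else if (error)
 *		return error;		(fs shut down, buffer already released)
 *	else
 *		write out the buffer, then release it
 */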
3357 :
3358 : /* Release an inode. */
3359 : void
3360 51286519471 : xfs_irele(
3361 : struct xfs_inode *ip)
3362 : {
3363 51286519471 : trace_xfs_irele(ip, _RET_IP_);
3364 51339075807 : iput(VFS_I(ip));
3365 51228782932 : }
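/*
 * Illustrative pairing (annotation): a reference obtained through
 * xfs_iget() is dropped with xfs_irele() once the caller is finished:
 *
 *	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
 *	if (error)
 *		return error;
 *	... operate on ip ...
 *	xfs_irele(ip);
 */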
3366 :
3367 : /*
3368 : * Ensure all committed transactions touching the inode are written to the log.
3369 : */
3370 : int
3371 303910 : xfs_log_force_inode(
3372 : struct xfs_inode *ip)
3373 : {
3374 303910 : xfs_csn_t seq = 0;
3375 :
3376 303910 : xfs_ilock(ip, XFS_ILOCK_SHARED);
3377 303910 : if (xfs_ipincount(ip))
3378 22160 : seq = ip->i_itemp->ili_commit_seq;
3379 303910 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3380 :
3381 303910 : if (!seq)
3382 : return 0;
3383 22160 : return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3384 : }
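/*
 * Illustrative use (annotation): a caller that needs the most recent
 * changes to an inode to be stable, e.g. before handing the file over to
 * another agent, could do:
 *
 *	error = xfs_log_force_inode(ip);
 *	if (error)
 *		return error;
 *	(all committed transactions touching ip are now in the on-disk log)
 */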
3385 :
3386 : /*
3387 : * Grab the exclusive iolock for a data copy from src to dest, making sure
3388 : * to abide by the VFS locking order (lowest pointer value goes first) and
3389 : * to break the layout leases before proceeding. The loop is needed because
3390 : * we cannot call the blocking break_layout() with the iolocks held, and
3391 : * therefore have to back out both locks.
3392 : */
3393 : static int
3394 104027372 : xfs_iolock_two_inodes_and_break_layout(
3395 : struct inode *src,
3396 : struct inode *dest)
3397 : {
3398 104027372 : int error;
3399 :
3400 104027372 : if (src > dest)
3401 51261790 : swap(src, dest);
3402 :
3403 104027372 : retry:
3404 : /* Wait to break both inodes' layouts before we start locking. */
3405 104027372 : error = break_layout(src, true);
3406 104027362 : if (error)
3407 0 : return error;
3408 104027362 : if (src != dest) {
3409 102370363 : error = break_layout(dest, true);
3410 102370016 : if (error)
3411 0 : return error;
3412 : }
3413 :
3414 : /* Lock one inode and make sure nobody got in and leased it. */
3415 104027015 : inode_lock(src);
3416 104027413 : error = break_layout(src, false);
3417 104028126 : if (error) {
3418 0 : inode_unlock(src);
3419 0 : if (error == -EWOULDBLOCK)
3420 0 : goto retry;
3421 0 : return error;
3422 : }
3423 :
3424 104028126 : if (src == dest)
3425 : return 0;
3426 :
3427 : /* Lock the other inode and make sure nobody got in and leased it. */
3428 102371127 : inode_lock_nested(dest, I_MUTEX_NONDIR2);
3429 102369551 : error = break_layout(dest, false);
3430 102369865 : if (error) {
3431 0 : inode_unlock(src);
3432 0 : inode_unlock(dest);
3433 0 : if (error == -EWOULDBLOCK)
3434 0 : goto retry;
3435 0 : return error;
3436 : }
3437 :
3438 : return 0;
3439 : }
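/*
 * Minimal sketch (annotation) of the deadlock-avoidance rule implemented
 * above: two tasks racing on the same pair of inodes always take the locks
 * in the same order, because the order is derived from the pointer values
 * rather than from the argument order:
 *
 *	if (src > dest)
 *		swap(src, dest);
 *	inode_lock(src);
 *	if (src != dest)
 *		inode_lock_nested(dest, I_MUTEX_NONDIR2);
 */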
3440 :
3441 : static int
3442 : xfs_mmaplock_two_inodes_and_break_dax_layout(
3443 : struct xfs_inode *ip1,
3444 : struct xfs_inode *ip2)
3445 : {
3446 : int error;
3447 : bool retry;
3448 : struct page *page;
3449 :
3450 : if (ip1->i_ino > ip2->i_ino)
3451 : swap(ip1, ip2);
3452 :
3453 : again:
3454 : retry = false;
3455 : /* Lock the first inode */
3456 : xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3457 : error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
3458 : if (error || retry) {
3459 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3460 : if (error == 0 && retry)
3461 : goto again;
3462 : return error;
3463 : }
3464 :
3465 : if (ip1 == ip2)
3466 : return 0;
3467 :
3468 : /* Nested lock the second inode */
3469 : xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
3470 : /*
3471 : * We cannot use xfs_break_dax_layouts() directly here because it may
3472 : * need to drop and retake XFS_MMAPLOCK_EXCL, which is not suitable
3473 : * for this nested locking case.
3474 : */
3475 : page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
3476 : if (page && page_ref_count(page) != 1) {
3477 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3478 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3479 : goto again;
3480 : }
3481 :
3482 : return 0;
3483 : }
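/*
 * Annotation: the function above cannot wait for a busy DAX page with both
 * MMAPLOCKs held, so on contention it drops both locks and restarts the
 * whole sequence from "again" until neither inode has a busy page left.
 */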
3484 :
3485 : /*
3486 : * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3487 : * mmap activity.
3488 : */
3489 : int
3490 104024276 : xfs_ilock2_io_mmap(
3491 : struct xfs_inode *ip1,
3492 : struct xfs_inode *ip2)
3493 : {
3494 104024276 : int ret;
3495 :
3496 104024276 : ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3497 104027427 : if (ret)
3498 : return ret;
3499 :
3500 104027447 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3501 : ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
3502 : if (ret) {
3503 : inode_unlock(VFS_I(ip2));
3504 : if (ip1 != ip2)
3505 : inode_unlock(VFS_I(ip1));
3506 : return ret;
3507 : }
3508 : } else
3509 104027447 : filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
3510 : VFS_I(ip2)->i_mapping);
3511 :
3512 104027447 : return 0;
3513 : }
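/*
 * Illustrative pairing (hypothetical caller, e.g. a remap or data-copy
 * operation): xfs_ilock2_io_mmap() and xfs_iunlock2_io_mmap() bracket the
 * region in which userspace cannot touch either file:
 *
 *	error = xfs_ilock2_io_mmap(ip1, ip2);
 *	if (error)
 *		return error;
 *	... move data between ip1 and ip2 ...
 *	xfs_iunlock2_io_mmap(ip1, ip2);
 */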
3514 :
3515 : /* Unlock both inodes to allow IO and mmap activity. */
3516 : void
3517 104022586 : xfs_iunlock2_io_mmap(
3518 : struct xfs_inode *ip1,
3519 : struct xfs_inode *ip2)
3520 : {
3521 104022586 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3522 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3523 : if (ip1 != ip2)
3524 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3525 : } else
3526 104022586 : filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
3527 : VFS_I(ip2)->i_mapping);
3528 :
3529 104022760 : inode_unlock(VFS_I(ip2));
3530 104022146 : if (ip1 != ip2)
3531 102365148 : inode_unlock(VFS_I(ip1));
3532 104024746 : }