Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include <linux/iversion.h>
7 :
8 : #include "xfs.h"
9 : #include "xfs_fs.h"
10 : #include "xfs_shared.h"
11 : #include "xfs_format.h"
12 : #include "xfs_log_format.h"
13 : #include "xfs_trans_resv.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_dir2.h"
18 : #include "xfs_attr.h"
19 : #include "xfs_trans_space.h"
20 : #include "xfs_trans.h"
21 : #include "xfs_buf_item.h"
22 : #include "xfs_inode_item.h"
23 : #include "xfs_iunlink_item.h"
24 : #include "xfs_ialloc.h"
25 : #include "xfs_bmap.h"
26 : #include "xfs_bmap_util.h"
27 : #include "xfs_errortag.h"
28 : #include "xfs_error.h"
29 : #include "xfs_quota.h"
30 : #include "xfs_filestream.h"
31 : #include "xfs_trace.h"
32 : #include "xfs_icache.h"
33 : #include "xfs_symlink.h"
34 : #include "xfs_trans_priv.h"
35 : #include "xfs_log.h"
36 : #include "xfs_bmap_btree.h"
37 : #include "xfs_reflink.h"
38 : #include "xfs_ag.h"
39 : #include "xfs_log_priv.h"
40 :
41 : struct kmem_cache *xfs_inode_cache;
42 :
43 : /*
44 : * Used in xfs_itruncate_extents(). This is the maximum number of extents
45 : * freed from a file in a single transaction.
46 : */
47 : #define XFS_ITRUNC_MAX_EXTENTS 2
48 :
49 : STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
50 : STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
51 : struct xfs_inode *);
52 :
53 : /*
54 : * Helper function to extract the extent size hint from an inode.
55 : */
56 : xfs_extlen_t
57 847540330 : xfs_get_extsz_hint(
58 : struct xfs_inode *ip)
59 : {
60 : /*
61 : * No point in aligning allocations if we need to COW to actually
62 : * write to them.
63 : */
64 847540330 : if (xfs_is_always_cow_inode(ip))
65 : return 0;
66 797680931 : if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
67 : return ip->i_extsize;
68 731069450 : if (XFS_IS_REALTIME_INODE(ip))
69 458259514 : return ip->i_mount->m_sb.sb_rextsize;
70 : return 0;
71 : }
72 :
73 : /*
74 : * Helper function to extract CoW extent size hint from inode.
75 : * Between the extent size hint and the CoW extent size hint, we
76 : * return the greater of the two. If the value is zero (automatic),
77 : * use the default size.
78 : */
79 : xfs_extlen_t
80 7886603 : xfs_get_cowextsz_hint(
81 : struct xfs_inode *ip)
82 : {
83 7886603 : xfs_extlen_t a, b;
84 :
85 7886603 : a = 0;
86 7886603 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
87 107936 : a = ip->i_cowextsize;
88 7886603 : b = xfs_get_extsz_hint(ip);
89 :
90 7885997 : a = max(a, b);
91 7885997 : if (a == 0)
92 7780896 : return XFS_DEFAULT_COWEXTSZ_HINT;
93 : return a;
94 : }
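/*
 * Worked example (editor's illustration, not part of xfs_inode.c): with a
 * 16-block extent size hint and no CoW hint set, this returns 16; with
 * neither hint set it falls back to XFS_DEFAULT_COWEXTSZ_HINT, which is
 * 32 blocks at the time of writing.
 */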
95 :
96 : /*
97 : * These two are wrapper routines around the xfs_ilock() routine used to
98 : * centralize some grungy code. They are used in places that wish to lock the
99 : * inode solely for reading the extents. The reason these places can't just
100 : * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
101 : * bringing in of the extents from disk for a file in b-tree format. If the
102 : * inode is in b-tree format, then we need to lock the inode exclusively until
103 : * the extents are read in. Locking it exclusively all the time would limit
104 : * our parallelism unnecessarily, though. What we do instead is check to see
105 : * if the extents have been read in yet, and only lock the inode exclusively
106 : * if they have not.
107 : *
108 : * The functions return a value which should be given to the corresponding
109 : * xfs_iunlock() call.
110 : */
111 : uint
112 551707172 : xfs_ilock_data_map_shared(
113 : struct xfs_inode *ip)
114 : {
115 551707172 : uint lock_mode = XFS_ILOCK_SHARED;
116 :
117 551707172 : if (xfs_need_iread_extents(&ip->i_df))
118 45319 : lock_mode = XFS_ILOCK_EXCL;
119 551604989 : xfs_ilock(ip, lock_mode);
120 551292337 : return lock_mode;
121 : }
122 :
123 : uint
124 113588753 : xfs_ilock_attr_map_shared(
125 : struct xfs_inode *ip)
126 : {
127 113588753 : uint lock_mode = XFS_ILOCK_SHARED;
128 :
129 209537708 : if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
130 2 : lock_mode = XFS_ILOCK_EXCL;
131 113442137 : xfs_ilock(ip, lock_mode);
132 113455758 : return lock_mode;
133 : }
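/*
 * Editor's illustrative usage sketch (not part of xfs_inode.c): the mode
 * returned by these wrappers must be handed back to xfs_iunlock()
 * unchanged, because the wrapper may have taken the ilock exclusively in
 * order to read the extent list in from disk:
 *
 *	uint lock_mode = xfs_ilock_data_map_shared(ip);
 *	... walk the data fork extent map ...
 *	xfs_iunlock(ip, lock_mode);
 */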
134 :
135 : /*
136 : * You can't set both SHARED and EXCL for the same lock,
137 : * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
138 : * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
139 : * to set in lock_flags.
140 : */
141 : static inline void
142 >12654*10^7 : xfs_lock_flags_assert(
143 : uint lock_flags)
144 : {
145 >12654*10^7 : ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
146 : (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
147 >12654*10^7 : ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
148 : (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
149 >12654*10^7 : ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
150 : (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
151 >12654*10^7 : ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
152 >12654*10^7 : ASSERT(lock_flags != 0);
153 >12654*10^7 : }
154 :
155 : /*
156 : * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
157 : * multi-reader locks: invalidate_lock and the i_lock. This routine allows
158 : * various combinations of the locks to be obtained.
159 : *
160 : * The 3 locks should always be ordered so that the IO lock is obtained first,
161 : * the mmap lock second and the ilock last in order to prevent deadlock.
162 : *
163 : * Basic locking order:
164 : *
165 : * i_rwsem -> invalidate_lock -> page_lock -> i_lock
166 : *
167 : * mmap_lock locking order:
168 : *
169 : * i_rwsem -> page_lock -> mmap_lock
170 : * mmap_lock -> invalidate_lock -> page_lock
171 : *
172 : * The difference in mmap_lock locking order means that we cannot hold the
173 : * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
174 : * can fault in pages during copy in/out (for buffered IO) or require the
175 : * mmap_lock in get_user_pages() to map the user pages into the kernel address
176 : * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
177 : * fault because page faults already hold the mmap_lock.
178 : *
179 : * Hence to serialise fully against both syscall and mmap based IO, we need to
180 : * take both the i_rwsem and the invalidate_lock. These locks should *only* be
181 : * both taken in places where we need to invalidate the page cache in a race
182 : * free manner (e.g. truncate, hole punch and other extent manipulation
183 : * functions).
184 : */
185 : void
186 62658014184 : xfs_ilock(
187 : xfs_inode_t *ip,
188 : uint lock_flags)
189 : {
190 62658014184 : trace_xfs_ilock(ip, lock_flags, _RET_IP_);
191 :
192 62230731864 : xfs_lock_flags_assert(lock_flags);
193 :
194 62477312212 : if (lock_flags & XFS_IOLOCK_EXCL) {
195 2468579819 : down_write_nested(&VFS_I(ip)->i_rwsem,
196 : XFS_IOLOCK_DEP(lock_flags));
197 60008732393 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
198 936904110 : down_read_nested(&VFS_I(ip)->i_rwsem,
199 : XFS_IOLOCK_DEP(lock_flags));
200 : }
201 :
202 62481173422 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
203 325269182 : down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
204 : XFS_MMAPLOCK_DEP(lock_flags));
205 62155904240 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
206 78215983 : down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
207 : XFS_MMAPLOCK_DEP(lock_flags));
208 : }
209 :
210 62481496951 : if (lock_flags & XFS_ILOCK_EXCL)
211 4296205140 : mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
212 58185291811 : else if (lock_flags & XFS_ILOCK_SHARED)
213 54575946939 : mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
214 63584845856 : }
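/*
 * Editor's illustrative sketch (an assumption drawn from the ordering
 * rules above, not a caller from this file): an extent-manipulation path
 * that must invalidate the page cache race-free takes both the IO lock
 * and the mmap lock before touching the ilock:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 *	... flush and invalidate the page cache, manipulate extents ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 */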
215 :
216 : /*
217 : * This is just like xfs_ilock(), except that the caller
218 : * is guaranteed not to sleep. It returns 1 if it gets
219 : * the requested locks and 0 otherwise. If the IO lock is
220 : * obtained but the inode lock cannot be, then the IO lock
221 : * is dropped before returning.
222 : *
223 : * ip -- the inode being locked
224 : * lock_flags -- this parameter indicates the inode's locks
225 : * to be locked. See the comment for xfs_ilock() for a list
226 : * of valid values.
227 : */
228 : int
229 2503512719 : xfs_ilock_nowait(
230 : xfs_inode_t *ip,
231 : uint lock_flags)
232 : {
233 2503512719 : trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
234 :
235 2502987718 : xfs_lock_flags_assert(lock_flags);
236 :
237 2503902831 : if (lock_flags & XFS_IOLOCK_EXCL) {
238 502757897 : if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
239 3574081 : goto out;
240 2001144934 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
241 0 : if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
242 0 : goto out;
243 : }
244 :
245 2500921862 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
246 505961 : if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
247 100 : goto out_undo_iolock;
248 2500415901 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
249 0 : if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
250 0 : goto out_undo_iolock;
251 : }
252 :
253 2500921794 : if (lock_flags & XFS_ILOCK_EXCL) {
254 1286645660 : if (!mrtryupdate(&ip->i_lock))
255 212791 : goto out_undo_mmaplock;
256 1214276134 : } else if (lock_flags & XFS_ILOCK_SHARED) {
257 714228338 : if (!mrtryaccess(&ip->i_lock))
258 75672257 : goto out_undo_mmaplock;
259 : }
260 : return 1;
261 :
262 75871803 : out_undo_mmaplock:
263 75871803 : if (lock_flags & XFS_MMAPLOCK_EXCL)
264 0 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
265 75871803 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
266 0 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
267 75871803 : out_undo_iolock:
268 75871903 : if (lock_flags & XFS_IOLOCK_EXCL)
269 0 : up_write(&VFS_I(ip)->i_rwsem);
270 75871903 : else if (lock_flags & XFS_IOLOCK_SHARED)
271 0 : up_read(&VFS_I(ip)->i_rwsem);
272 75871903 : out:
273 : return 0;
274 : }
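/*
 * Editor's illustrative trylock pattern (a sketch, not a caller from this
 * file): code that must not block, e.g. a background scan, backs off when
 * the lock is contended and revisits the inode later:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
 *		return -EAGAIN;
 *	...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 */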
275 :
276 : /*
277 : * xfs_iunlock() is used to drop the inode locks acquired with
278 : * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
279 : * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
280 : * that we know which locks to drop.
281 : *
282 : * ip -- the inode being unlocked
283 : * lock_flags -- this parameter indicates the inode's locks
284 : * to be unlocked. See the comment for xfs_ilock() for a list
285 : * of valid values for this parameter.
286 : *
287 : */
288 : void
289 62844470726 : xfs_iunlock(
290 : xfs_inode_t *ip,
291 : uint lock_flags)
292 : {
293 62844470726 : xfs_lock_flags_assert(lock_flags);
294 :
295 62271691110 : if (lock_flags & XFS_IOLOCK_EXCL)
296 2959399957 : up_write(&VFS_I(ip)->i_rwsem);
297 59312291153 : else if (lock_flags & XFS_IOLOCK_SHARED)
298 941437779 : up_read(&VFS_I(ip)->i_rwsem);
299 :
300 62285949045 : if (lock_flags & XFS_MMAPLOCK_EXCL)
301 326173888 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
302 61959775157 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
303 78404169 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
304 :
305 62286109989 : if (lock_flags & XFS_ILOCK_EXCL)
306 5593875760 : mrunlock_excl(&ip->i_lock);
307 56692234229 : else if (lock_flags & XFS_ILOCK_SHARED)
308 55123430512 : mrunlock_shared(&ip->i_lock);
309 :
310 63173550611 : trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
311 61952378134 : }
312 :
313 : /*
314 : * Give up write locks. The i/o lock cannot be held nested
315 : * if it is being demoted.
316 : */
317 : void
318 4986346 : xfs_ilock_demote(
319 : xfs_inode_t *ip,
320 : uint lock_flags)
321 : {
322 4986346 : ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
323 4986346 : ASSERT((lock_flags &
324 : ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
325 :
326 4986346 : if (lock_flags & XFS_ILOCK_EXCL)
327 0 : mrdemote(&ip->i_lock);
328 4986346 : if (lock_flags & XFS_MMAPLOCK_EXCL)
329 0 : downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
330 4986346 : if (lock_flags & XFS_IOLOCK_EXCL)
331 4958839 : downgrade_write(&VFS_I(ip)->i_rwsem);
332 :
333 5101407 : trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
334 4891288 : }
335 :
336 : #if defined(DEBUG) || defined(XFS_WARN)
337 : static inline bool
338 : __xfs_rwsem_islocked(
339 : struct rw_semaphore *rwsem,
340 : bool shared)
341 : {
342 1845477745 : if (!debug_locks)
343 0 : return rwsem_is_locked(rwsem);
344 :
345 : if (!shared)
346 : return lockdep_is_held_type(rwsem, 0);
347 :
348 : /*
349 : * We are checking that the lock is held at least in shared
350 : * mode but don't care that it might be held exclusively
351 : * (i.e. shared | excl). Hence we check if the lock is held
352 : * in any mode rather than an explicit shared mode.
353 : */
354 : return lockdep_is_held_type(rwsem, -1);
355 : }
356 :
357 : bool
358 20026908545 : xfs_isilocked(
359 : struct xfs_inode *ip,
360 : uint lock_flags)
361 : {
362 20026908545 : if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
363 18200003622 : if (!(lock_flags & XFS_ILOCK_SHARED))
364 9292563537 : return !!ip->i_lock.mr_writer;
365 8907440085 : return rwsem_is_locked(&ip->i_lock.mr_lock);
366 : }
367 :
368 1826904923 : if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
369 80934174 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
370 : (lock_flags & XFS_MMAPLOCK_SHARED));
371 : }
372 :
373 1745970749 : if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
374 1764543571 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
375 : (lock_flags & XFS_IOLOCK_SHARED));
376 : }
377 :
378 0 : ASSERT(0);
379 0 : return false;
380 : }
381 : #endif
382 :
383 : /*
384 : * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
385 : * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
386 : * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
387 : * errors and warnings.
388 : */
389 : #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
390 : static bool
391 : xfs_lockdep_subclass_ok(
392 : int subclass)
393 : {
394 : return subclass < MAX_LOCKDEP_SUBCLASSES;
395 : }
396 : #else
397 : #define xfs_lockdep_subclass_ok(subclass) (true)
398 : #endif
399 :
400 : /*
401 : * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
402 : * value. This can be called for any type of inode lock combination, including
403 : * parent locking. Care must be taken to ensure we don't overrun the subclass
404 : * storage fields in the class mask we build.
405 : */
406 : static inline uint
407 284329731 : xfs_lock_inumorder(
408 : uint lock_mode,
409 : uint subclass)
410 : {
411 284329731 : uint class = 0;
412 :
413 284329731 : ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
414 : XFS_ILOCK_RTSUM)));
415 284329731 : ASSERT(xfs_lockdep_subclass_ok(subclass));
416 :
417 284329731 : if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
418 0 : ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
419 0 : class += subclass << XFS_IOLOCK_SHIFT;
420 : }
421 :
422 284329731 : if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
423 0 : ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
424 0 : class += subclass << XFS_MMAPLOCK_SHIFT;
425 : }
426 :
427 284329731 : if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
428 284348046 : ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
429 284348046 : class += subclass << XFS_ILOCK_SHIFT;
430 : }
431 :
432 284329731 : return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
433 : }
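/*
 * Worked example (editor's illustration of the code above): for the third
 * inode in a batch, xfs_lock_inumorder(XFS_ILOCK_EXCL, 2) keeps the
 * XFS_ILOCK_EXCL mode bits and folds (2 << XFS_ILOCK_SHIFT) into the
 * subclass field, so lockdep sees a distinct lock class for each inode
 * locked by xfs_lock_inodes().
 */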
434 :
435 : /*
436 : * The following routine will lock n inodes in exclusive mode. We assume the
437 : * caller calls us with the inodes in i_ino order.
438 : *
439 : * We need to detect deadlock where an inode that we lock is in the AIL and we
440 : * start waiting for another inode that is locked by a thread in a long running
441 : * transaction (such as truncate). This can result in deadlock since the long
442 : * running trans might need to wait for the inode we just locked in order to
443 : * push the tail and free space in the log.
444 : *
445 : * xfs_lock_inodes() can only be used to lock one type of lock at a time -
446 : * the iolock, the mmaplock or the ilock. If we
447 : * lock more than one at a time, lockdep will report false positives saying we
448 : * have violated locking orders.
449 : */
450 : static void
451 46001517 : xfs_lock_inodes(
452 : struct xfs_inode **ips,
453 : int inodes,
454 : uint lock_mode)
455 : {
456 46001517 : int attempts = 0;
457 46001517 : uint i;
458 46001517 : int j;
459 46001517 : bool try_lock;
460 46001517 : struct xfs_log_item *lp;
461 :
462 : /*
463 : * Currently supports between 2 and 5 inodes with exclusive locking. We
464 : * support an arbitrary depth of locking here, but absolute limits on
465 : * inodes depend on the type of locking and the limits placed by
466 : * lockdep annotations in xfs_lock_inumorder. These are all checked by
467 : * the asserts.
468 : */
469 46001517 : ASSERT(ips && inodes >= 2 && inodes <= 5);
470 46001517 : ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
471 : XFS_ILOCK_EXCL));
472 46001517 : ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
473 : XFS_ILOCK_SHARED)));
474 46001517 : ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
475 : inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
476 46001517 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
477 : inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
478 :
479 46001517 : if (lock_mode & XFS_IOLOCK_EXCL) {
480 0 : ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
481 46001517 : } else if (lock_mode & XFS_MMAPLOCK_EXCL)
482 0 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
483 :
484 46001517 : again:
485 46016994 : try_lock = false;
486 46016994 : i = 0;
487 199909550 : for (; i < inodes; i++) {
488 153906970 : ASSERT(ips[i]);
489 :
490 153906970 : if (i && (ips[i] == ips[i - 1])) /* Already locked */
491 4508019 : continue;
492 :
493 : /*
494 : * If try_lock is not set yet, make sure all locked inodes are
495 : * not in the AIL. If any are, set try_lock to be used later.
496 : */
497 149398951 : if (!try_lock) {
498 197920785 : for (j = (i - 1); j >= 0 && !try_lock; j--) {
499 86715275 : lp = &ips[j]->i_itemp->ili_item;
500 172722235 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
501 32421130 : try_lock = true;
502 : }
503 : }
504 :
505 : /*
506 : * If any of the previous locks we have locked is in the AIL,
507 : * we must TRY to get the second and subsequent locks. If
508 : * we can't get any, we must release all we have
509 : * and try again.
510 : */
511 149398951 : if (!try_lock) {
512 78784989 : xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
513 78785116 : continue;
514 : }
515 :
516 : /* try_lock means we have an inode locked that is in the AIL. */
517 70613962 : ASSERT(i != 0);
518 70613962 : if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
519 70599421 : continue;
520 :
521 : /*
522 : * Unlock all previous guys and try again. xfs_iunlock will try
523 : * to push the tail if the inode is in the AIL.
524 : */
525 15477 : attempts++;
526 49531 : for (j = i - 1; j >= 0; j--) {
527 : /*
528 : * Check to see if we've already unlocked this one. Not
529 : * the first one going back, and the inode ptr is the
530 : * same.
531 : */
532 34054 : if (j != (i - 1) && ips[j] == ips[j + 1])
533 10968 : continue;
534 :
535 23086 : xfs_iunlock(ips[j], lock_mode);
536 : }
537 :
538 15477 : if ((attempts % 5) == 0) {
539 3035 : delay(1); /* Don't just spin the CPU */
540 : }
541 15477 : goto again;
542 : }
543 46002580 : }
544 :
545 : /*
546 : * xfs_lock_two_inodes() can only be used to lock the ilock. The iolock and
547 : * mmaplock must be double-locked separately since we use i_rwsem and
548 : * invalidate_lock for that. We now support taking one lock EXCL and the
549 : * other SHARED.
550 : */
551 : void
552 67503305 : xfs_lock_two_inodes(
553 : struct xfs_inode *ip0,
554 : uint ip0_mode,
555 : struct xfs_inode *ip1,
556 : uint ip1_mode)
557 : {
558 67503305 : int attempts = 0;
559 67503305 : struct xfs_log_item *lp;
560 :
561 67503305 : ASSERT(hweight32(ip0_mode) == 1);
562 67503305 : ASSERT(hweight32(ip1_mode) == 1);
563 67503305 : ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
564 67503305 : ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
565 67503305 : ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
566 67503305 : ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
567 67503305 : ASSERT(ip0->i_ino != ip1->i_ino);
568 :
569 67503305 : if (ip0->i_ino > ip1->i_ino) {
570 13328786 : swap(ip0, ip1);
571 13328786 : swap(ip0_mode, ip1_mode);
572 : }
573 :
574 67503305 : again:
575 67515071 : xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
576 :
577 : /*
578 : * If the first lock we have locked is in the AIL, we must TRY to get
579 : * the second lock. If we can't get it, we must release the first one
580 : * and try again.
581 : */
582 67525877 : lp = &ip0->i_itemp->ili_item;
583 67525877 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
584 48236708 : if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
585 11765 : xfs_iunlock(ip0, ip0_mode);
586 11766 : if ((++attempts % 5) == 0)
587 2266 : delay(1); /* Don't just spin the CPU */
588 11766 : goto again;
589 : }
590 : } else {
591 19289169 : xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
592 : }
593 67520739 : }
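/*
 * Editor's illustrative usage (a sketch; "tip" is a hypothetical second
 * inode): a caller that must update two inodes atomically, e.g. an extent
 * swap, locks both ilocks in one call; the helper sorts by inode number
 * internally, so argument order does not matter:
 *
 *	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
 */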
594 :
595 : uint
596 53537294884 : xfs_ip2xflags(
597 : struct xfs_inode *ip)
598 : {
599 53537294884 : uint flags = 0;
600 :
601 53537294884 : if (ip->i_diflags & XFS_DIFLAG_ANY) {
602 16012670344 : if (ip->i_diflags & XFS_DIFLAG_REALTIME)
603 7029725567 : flags |= FS_XFLAG_REALTIME;
604 16012670344 : if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
605 8300985975 : flags |= FS_XFLAG_PREALLOC;
606 16012670344 : if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
607 956 : flags |= FS_XFLAG_IMMUTABLE;
608 16012670344 : if (ip->i_diflags & XFS_DIFLAG_APPEND)
609 907 : flags |= FS_XFLAG_APPEND;
610 16012670344 : if (ip->i_diflags & XFS_DIFLAG_SYNC)
611 7506 : flags |= FS_XFLAG_SYNC;
612 16012670344 : if (ip->i_diflags & XFS_DIFLAG_NOATIME)
613 235 : flags |= FS_XFLAG_NOATIME;
614 16012670344 : if (ip->i_diflags & XFS_DIFLAG_NODUMP)
615 248 : flags |= FS_XFLAG_NODUMP;
616 16012670344 : if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
617 3544004403 : flags |= FS_XFLAG_RTINHERIT;
618 16012670344 : if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
619 3733 : flags |= FS_XFLAG_PROJINHERIT;
620 16012670344 : if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
621 44 : flags |= FS_XFLAG_NOSYMLINKS;
622 16012670344 : if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
623 16277 : flags |= FS_XFLAG_EXTSIZE;
624 16012670344 : if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
625 11628 : flags |= FS_XFLAG_EXTSZINHERIT;
626 16012670344 : if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
627 4 : flags |= FS_XFLAG_NODEFRAG;
628 16012670344 : if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
629 80340 : flags |= FS_XFLAG_FILESTREAM;
630 : }
631 :
632 53537294884 : if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
633 52867866936 : if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
634 473 : flags |= FS_XFLAG_DAX;
635 52867866936 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
636 38738 : flags |= FS_XFLAG_COWEXTSIZE;
637 : }
638 :
639 53537294884 : if (xfs_inode_has_attr_fork(ip))
640 7037594455 : flags |= FS_XFLAG_HASATTR;
641 53537294884 : return flags;
642 : }
643 :
644 : /*
645 : * Looks up an inode from "name". If ci_name is not NULL, then a CI match
646 : * is allowed, otherwise it has to be an exact match. If a CI match is found,
647 : * ci_name->name will point to the actual name (caller must free) or
648 : * will be set to NULL if an exact match is found.
649 : */
650 : int
651 170609237 : xfs_lookup(
652 : struct xfs_inode *dp,
653 : const struct xfs_name *name,
654 : struct xfs_inode **ipp,
655 : struct xfs_name *ci_name)
656 : {
657 170609237 : xfs_ino_t inum;
658 170609237 : int error;
659 :
660 170609237 : trace_xfs_lookup(dp, name);
661 :
662 340463194 : if (xfs_is_shutdown(dp->i_mount))
663 : return -EIO;
664 :
665 170175156 : error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
666 170545976 : if (error)
667 122845206 : goto out_unlock;
668 :
669 47700770 : error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
670 47746188 : if (error)
671 4187 : goto out_free_name;
672 :
673 : return 0;
674 :
675 : out_free_name:
676 4187 : if (ci_name)
677 0 : kmem_free(ci_name->name);
678 4187 : out_unlock:
679 122849393 : *ipp = NULL;
680 122849393 : return error;
681 : }
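/*
 * Editor's illustrative sketch (variable names are hypothetical): a caller
 * requesting a case-insensitive match owns the returned name buffer and
 * must free it, as the comment above notes:
 *
 *	struct xfs_name ci_name;
 *	error = xfs_lookup(dp, &xname, &ip, &ci_name);
 *	if (!error && ci_name.name)
 *		kmem_free(ci_name.name);
 */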
682 :
683 : /* Propagate di_flags from a parent inode to a child inode. */
684 : static void
685 22209839 : xfs_inode_inherit_flags(
686 : struct xfs_inode *ip,
687 : const struct xfs_inode *pip)
688 : {
689 22209839 : unsigned int di_flags = 0;
690 22209839 : xfs_failaddr_t failaddr;
691 22209839 : umode_t mode = VFS_I(ip)->i_mode;
692 :
693 22209839 : if (S_ISDIR(mode)) {
694 2865618 : if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
695 2865543 : di_flags |= XFS_DIFLAG_RTINHERIT;
696 2865618 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
697 0 : di_flags |= XFS_DIFLAG_EXTSZINHERIT;
698 0 : ip->i_extsize = pip->i_extsize;
699 : }
700 2865618 : if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
701 6 : di_flags |= XFS_DIFLAG_PROJINHERIT;
702 19344221 : } else if (S_ISREG(mode)) {
703 19327736 : if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
704 19275496 : xfs_has_realtime(ip->i_mount))
705 18840291 : di_flags |= XFS_DIFLAG_REALTIME;
706 19327736 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
707 2146 : di_flags |= XFS_DIFLAG_EXTSIZE;
708 2146 : ip->i_extsize = pip->i_extsize;
709 : }
710 : }
711 22209839 : if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
712 0 : xfs_inherit_noatime)
713 0 : di_flags |= XFS_DIFLAG_NOATIME;
714 22209839 : if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
715 0 : xfs_inherit_nodump)
716 0 : di_flags |= XFS_DIFLAG_NODUMP;
717 22209839 : if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
718 0 : xfs_inherit_sync)
719 0 : di_flags |= XFS_DIFLAG_SYNC;
720 22209839 : if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
721 11 : xfs_inherit_nosymlinks)
722 0 : di_flags |= XFS_DIFLAG_NOSYMLINKS;
723 22209839 : if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
724 0 : xfs_inherit_nodefrag)
725 0 : di_flags |= XFS_DIFLAG_NODEFRAG;
726 22209839 : if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
727 26199 : di_flags |= XFS_DIFLAG_FILESTREAM;
728 :
729 22209839 : ip->i_diflags |= di_flags;
730 :
731 : /*
732 : * Inode verifiers on older kernels only check that the extent size
733 : * hint is an integer multiple of the rt extent size on realtime files.
734 : * They did not check the hint alignment on a directory with both
735 : * rtinherit and extszinherit flags set. If the misaligned hint is
736 : * propagated from a directory into a new realtime file, new file
737 : * allocations will fail due to math errors in the rt allocator and/or
738 : * trip the verifiers. Validate the hint settings in the new file so
739 : * that we don't let broken hints propagate.
740 : */
741 22209839 : failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
742 : VFS_I(ip)->i_mode, ip->i_diflags);
743 22030495 : if (failaddr) {
744 3 : ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
745 : XFS_DIFLAG_EXTSZINHERIT);
746 3 : ip->i_extsize = 0;
747 : }
748 22030495 : }
749 :
750 : /* Propagate di_flags2 from a parent inode to a child inode. */
751 : static void
752 50770299 : xfs_inode_inherit_flags2(
753 : struct xfs_inode *ip,
754 : const struct xfs_inode *pip)
755 : {
756 50770299 : xfs_failaddr_t failaddr;
757 :
758 50770299 : if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
759 944 : ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
760 944 : ip->i_cowextsize = pip->i_cowextsize;
761 : }
762 50770299 : if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
763 165 : ip->i_diflags2 |= XFS_DIFLAG2_DAX;
764 :
765 : /* Don't let invalid cowextsize hints propagate. */
766 50770299 : failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
767 50770299 : VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
768 50970933 : if (failaddr) {
769 0 : ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
770 0 : ip->i_cowextsize = 0;
771 : }
772 50970933 : }
773 :
774 : /*
775 : * Initialise a newly allocated inode and return the in-core inode to the
776 : * caller locked exclusively.
777 : */
778 : int
779 84709894 : xfs_init_new_inode(
780 : struct mnt_idmap *idmap,
781 : struct xfs_trans *tp,
782 : struct xfs_inode *pip,
783 : xfs_ino_t ino,
784 : umode_t mode,
785 : xfs_nlink_t nlink,
786 : dev_t rdev,
787 : prid_t prid,
788 : bool init_xattrs,
789 : struct xfs_inode **ipp)
790 : {
791 84709894 : struct inode *dir = pip ? VFS_I(pip) : NULL;
792 84709894 : struct xfs_mount *mp = tp->t_mountp;
793 84709894 : struct xfs_inode *ip;
794 84709894 : unsigned int flags;
795 84709894 : int error;
796 84709894 : struct timespec64 tv;
797 84709894 : struct inode *inode;
798 :
799 : /*
800 : * Protect against obviously corrupt allocation btree records. Later
801 : * xfs_iget checks will catch re-allocation of other active in-memory
802 : * and on-disk inodes. If we don't catch reallocating the parent inode
803 : * here we will deadlock in xfs_iget() so we have to do these checks
804 : * first.
805 : */
806 84709894 : if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
807 0 : xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
808 0 : return -EFSCORRUPTED;
809 : }
810 :
811 : /*
812 : * Get the in-core inode with the lock held exclusively to prevent
813 : * others from looking at until we're done.
814 : */
815 84577589 : error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
816 84875274 : if (error)
817 : return error;
818 :
819 84875263 : ASSERT(ip != NULL);
820 84875263 : inode = VFS_I(ip);
821 84875263 : set_nlink(inode, nlink);
822 84678880 : inode->i_rdev = rdev;
823 84678880 : ip->i_projid = prid;
824 :
825 84678880 : if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
826 0 : inode_fsuid_set(inode, idmap);
827 0 : inode->i_gid = dir->i_gid;
828 0 : inode->i_mode = mode;
829 : } else {
830 84678880 : inode_init_owner(idmap, inode, dir, mode);
831 : }
832 :
833 : /*
834 : * If the group ID of the new file does not match the effective group
835 : * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
836 : * (and only if the irix_sgid_inherit compatibility variable is set).
837 : */
838 84627484 : if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
839 0 : !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
840 0 : inode->i_mode &= ~S_ISGID;
841 :
842 84627484 : ip->i_disk_size = 0;
843 84627484 : ip->i_df.if_nextents = 0;
844 84627484 : ASSERT(ip->i_nblocks == 0);
845 :
846 84627484 : tv = current_time(inode);
847 84278527 : inode->i_mtime = tv;
848 84278527 : inode->i_atime = tv;
849 84278527 : inode->i_ctime = tv;
850 :
851 84278527 : ip->i_extsize = 0;
852 84278527 : ip->i_diflags = 0;
853 :
854 84278527 : if (xfs_has_v3inodes(mp)) {
855 84470210 : inode_set_iversion(inode, 1);
856 84470210 : ip->i_cowextsize = 0;
857 84470210 : ip->i_crtime = tv;
858 : }
859 :
860 84278527 : flags = XFS_ILOG_CORE;
861 84278527 : switch (mode & S_IFMT) {
862 7561403 : case S_IFIFO:
863 : case S_IFCHR:
864 : case S_IFBLK:
865 : case S_IFSOCK:
866 7561403 : ip->i_df.if_format = XFS_DINODE_FMT_DEV;
867 7561403 : flags |= XFS_ILOG_DEV;
868 7561403 : break;
869 51014476 : case S_IFREG:
870 : case S_IFDIR:
871 51014476 : if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
872 22145128 : xfs_inode_inherit_flags(ip, pip);
873 50788886 : if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
874 50785694 : xfs_inode_inherit_flags2(ip, pip);
875 76668201 : fallthrough;
876 : case S_IFLNK:
877 76668201 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
878 76668201 : ip->i_df.if_bytes = 0;
879 76668201 : ip->i_df.if_u1.if_root = NULL;
880 76668201 : break;
881 0 : default:
882 0 : ASSERT(0);
883 : }
884 :
885 : /*
886 : * If we need to create attributes immediately after allocating the
887 : * inode, initialise an empty attribute fork right now. We use the
888 : * default fork offset for attributes here as we don't know exactly what
889 : * size or how many attributes we might be adding. We can do this
890 : * safely here because we know the data fork is completely empty and
891 : * this saves us from needing to run a separate transaction to set the
892 : * fork offset in the immediate future.
893 : */
894 84229604 : if (init_xattrs && xfs_has_attr(mp)) {
895 170965 : ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
896 170965 : xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
897 : }
898 :
899 : /*
900 : * Log the new values stuffed into the inode.
901 : */
902 84229604 : xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
903 84489644 : xfs_trans_log_inode(tp, ip, flags);
904 :
905 : /* now that we have an i_mode we can setup the inode structure */
906 84972673 : xfs_setup_inode(ip);
907 :
908 84532313 : *ipp = ip;
909 84532313 : return 0;
910 : }
911 :
912 : /*
913 : * Decrement the link count on an inode & log the change. If this causes the
914 : * link count to go to zero, move the inode to the AGI unlinked list so that it can
915 : * be freed when the last active reference goes away via xfs_inactive().
916 : */
917 : static int /* error */
918 66592166 : xfs_droplink(
919 : xfs_trans_t *tp,
920 : xfs_inode_t *ip)
921 : {
922 66592166 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
923 :
924 66547074 : drop_nlink(VFS_I(ip));
925 66610073 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
926 :
927 66648568 : if (VFS_I(ip)->i_nlink)
928 : return 0;
929 :
930 49757616 : return xfs_iunlink(tp, ip);
931 : }
932 :
933 : /*
934 : * Increment the link count on an inode & log the change.
935 : */
936 : static void
937 26878812 : xfs_bumplink(
938 : xfs_trans_t *tp,
939 : xfs_inode_t *ip)
940 : {
941 26878812 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
942 :
943 26878608 : inc_nlink(VFS_I(ip));
944 26876613 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
945 26883865 : }
946 :
947 : int
948 49039323 : xfs_create(
949 : struct mnt_idmap *idmap,
950 : xfs_inode_t *dp,
951 : struct xfs_name *name,
952 : umode_t mode,
953 : dev_t rdev,
954 : bool init_xattrs,
955 : xfs_inode_t **ipp)
956 : {
957 49039323 : int is_dir = S_ISDIR(mode);
958 49039323 : struct xfs_mount *mp = dp->i_mount;
959 49039323 : struct xfs_inode *ip = NULL;
960 49039323 : struct xfs_trans *tp = NULL;
961 49039323 : int error;
962 49039323 : bool unlock_dp_on_error = false;
963 49039323 : prid_t prid;
964 49039323 : struct xfs_dquot *udqp = NULL;
965 49039323 : struct xfs_dquot *gdqp = NULL;
966 49039323 : struct xfs_dquot *pdqp = NULL;
967 49039323 : struct xfs_trans_res *tres;
968 49039323 : uint resblks;
969 49039323 : xfs_ino_t ino;
970 :
971 49039323 : trace_xfs_create(dp, name);
972 :
973 97435922 : if (xfs_is_shutdown(mp))
974 : return -EIO;
975 :
976 48717939 : prid = xfs_get_initial_prid(dp);
977 :
978 : /*
979 : * Make sure that we have allocated dquot(s) on disk.
980 : */
981 48717939 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
982 : mapped_fsgid(idmap, &init_user_ns), prid,
983 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
984 : &udqp, &gdqp, &pdqp);
985 49249303 : if (error)
986 : return error;
987 :
988 49248351 : if (is_dir) {
989 6667733 : resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
990 6667733 : tres = &M_RES(mp)->tr_mkdir;
991 : } else {
992 42580618 : resblks = XFS_CREATE_SPACE_RES(mp, name->len);
993 42580618 : tres = &M_RES(mp)->tr_create;
994 : }
995 :
996 : /*
997 : * Initially assume that the file does not exist and
998 : * reserve the resources for that case. If that is not
999 : * the case we'll drop the one we have and get a more
1000 : * appropriate transaction later.
1001 : */
1002 49248351 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1003 : &tp);
1004 49348243 : if (error == -ENOSPC) {
1005 : /* flush outstanding delalloc blocks and retry */
1006 460454 : xfs_flush_inodes(mp);
1007 460382 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
1008 : resblks, &tp);
1009 : }
1010 49348127 : if (error)
1011 426122 : goto out_release_dquots;
1012 :
1013 48922005 : xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1014 48890477 : unlock_dp_on_error = true;
1015 :
1016 : /*
1017 : * A newly created regular or special file just has one directory
1018 : * entry pointing to it, but a directory also has the "." entry
1019 : * pointing to itself.
1020 : */
1021 48890477 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1022 49063629 : if (!error)
1023 91245020 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1024 : is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
1025 48948721 : if (error)
1026 158574 : goto out_trans_cancel;
1027 :
1028 : /*
1029 : * Now we join the directory inode to the transaction. We do not do it
1030 : * earlier because xfs_dialloc might commit the previous transaction
1031 : * (and release all the locks). An error from here on will result in
1032 : * the transaction cancel unlocking dp so don't do it explicitly in the
1033 : * error path.
1034 : */
1035 48790147 : xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1036 48902919 : unlock_dp_on_error = false;
1037 :
1038 97805838 : error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1039 48904937 : resblks - XFS_IALLOC_SPACE_RES(mp));
1040 48785067 : if (error) {
1041 173 : ASSERT(error != -ENOSPC);
1042 173 : goto out_trans_cancel;
1043 : }
1044 48784894 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1045 48757963 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1046 :
1047 48935400 : if (is_dir) {
1048 6561948 : error = xfs_dir_init(tp, ip, dp);
1049 6560664 : if (error)
1050 0 : goto out_trans_cancel;
1051 :
1052 6560664 : xfs_bumplink(tp, dp);
1053 : }
1054 :
1055 : /*
1056 : * If this is a synchronous mount, make sure that the
1057 : * create transaction goes to disk before returning to
1058 : * the user.
1059 : */
1060 48935315 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1061 708 : xfs_trans_set_sync(tp);
1062 :
1063 : /*
1064 : * Attach the dquot(s) to the inodes and modify them incore.
1065 : * These ids of the inode couldn't have changed since the new
1066 : * inode has been locked ever since it was created.
1067 : */
1068 48935315 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1069 :
1070 48858232 : error = xfs_trans_commit(tp);
1071 48892820 : if (error)
1072 278 : goto out_release_inode;
1073 :
1074 48892542 : xfs_qm_dqrele(udqp);
1075 48812353 : xfs_qm_dqrele(gdqp);
1076 48839590 : xfs_qm_dqrele(pdqp);
1077 :
1078 48850178 : *ipp = ip;
1079 48850178 : return 0;
1080 :
1081 158747 : out_trans_cancel:
1082 158747 : xfs_trans_cancel(tp);
1083 150839 : out_release_inode:
1084 : /*
1085 : * Wait until after the current transaction is aborted to finish the
1086 : * setup of the inode and release the inode. This prevents recursive
1087 : * transactions and deadlocks from xfs_inactive.
1088 : */
1089 150839 : if (ip) {
1090 451 : xfs_finish_inode_setup(ip);
1091 451 : xfs_irele(ip);
1092 : }
1093 150388 : out_release_dquots:
1094 576961 : xfs_qm_dqrele(udqp);
1095 585583 : xfs_qm_dqrele(gdqp);
1096 585215 : xfs_qm_dqrele(pdqp);
1097 :
1098 584872 : if (unlock_dp_on_error)
1099 158121 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
1100 : return error;
1101 : }
1102 :
1103 : int
1104 9750662 : xfs_create_tmpfile(
1105 : struct mnt_idmap *idmap,
1106 : struct xfs_inode *dp,
1107 : umode_t mode,
1108 : struct xfs_inode **ipp)
1109 : {
1110 9750662 : struct xfs_mount *mp = dp->i_mount;
1111 9750662 : struct xfs_inode *ip = NULL;
1112 9750662 : struct xfs_trans *tp = NULL;
1113 9750662 : int error;
1114 9750662 : prid_t prid;
1115 9750662 : struct xfs_dquot *udqp = NULL;
1116 9750662 : struct xfs_dquot *gdqp = NULL;
1117 9750662 : struct xfs_dquot *pdqp = NULL;
1118 9750662 : struct xfs_trans_res *tres;
1119 9750662 : uint resblks;
1120 9750662 : xfs_ino_t ino;
1121 :
1122 19501324 : if (xfs_is_shutdown(mp))
1123 : return -EIO;
1124 :
1125 9750661 : prid = xfs_get_initial_prid(dp);
1126 :
1127 : /*
1128 : * Make sure that we have allocated dquot(s) on disk.
1129 : */
1130 9750661 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1131 : mapped_fsgid(idmap, &init_user_ns), prid,
1132 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1133 : &udqp, &gdqp, &pdqp);
1134 9875916 : if (error)
1135 : return error;
1136 :
1137 9875916 : resblks = XFS_IALLOC_SPACE_RES(mp);
1138 9875916 : tres = &M_RES(mp)->tr_create_tmpfile;
1139 :
1140 9875916 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1141 : &tp);
1142 10275372 : if (error)
1143 57007 : goto out_release_dquots;
1144 :
1145 10218365 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1146 10085557 : if (!error)
1147 10147724 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1148 : 0, 0, prid, false, &ip);
1149 10037187 : if (error)
1150 514 : goto out_trans_cancel;
1151 :
1152 10036673 : if (xfs_has_wsync(mp))
1153 0 : xfs_trans_set_sync(tp);
1154 :
1155 : /*
1156 : * Attach the dquot(s) to the inodes and modify them incore.
1157 : * These ids of the inode couldn't have changed since the new
1158 : * inode has been locked ever since it was created.
1159 : */
1160 10036673 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1161 :
1162 10167099 : error = xfs_iunlink(tp, ip);
1163 10232600 : if (error)
1164 0 : goto out_trans_cancel;
1165 :
1166 10232600 : error = xfs_trans_commit(tp);
1167 10296312 : if (error)
1168 1 : goto out_release_inode;
1169 :
1170 10296311 : xfs_qm_dqrele(udqp);
1171 10197446 : xfs_qm_dqrele(gdqp);
1172 10174044 : xfs_qm_dqrele(pdqp);
1173 :
1174 10203239 : *ipp = ip;
1175 10203239 : return 0;
1176 :
1177 514 : out_trans_cancel:
1178 514 : xfs_trans_cancel(tp);
1179 515 : out_release_inode:
1180 : /*
1181 : * Wait until after the current transaction is aborted to finish the
1182 : * setup of the inode and release the inode. This prevents recursive
1183 : * transactions and deadlocks from xfs_inactive.
1184 : */
1185 515 : if (ip) {
1186 1 : xfs_finish_inode_setup(ip);
1187 1 : xfs_irele(ip);
1188 : }
1189 514 : out_release_dquots:
1190 57522 : xfs_qm_dqrele(udqp);
1191 57522 : xfs_qm_dqrele(gdqp);
1192 57522 : xfs_qm_dqrele(pdqp);
1193 :
1194 57522 : return error;
1195 : }
1196 :
1197 : int
1198 8335258 : xfs_link(
1199 : xfs_inode_t *tdp,
1200 : xfs_inode_t *sip,
1201 : struct xfs_name *target_name)
1202 : {
1203 8335258 : xfs_mount_t *mp = tdp->i_mount;
1204 8335258 : xfs_trans_t *tp;
1205 8335258 : int error, nospace_error = 0;
1206 8335258 : int resblks;
1207 :
1208 8335258 : trace_xfs_link(tdp, target_name);
1209 :
1210 8334953 : ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1211 :
1212 16669906 : if (xfs_is_shutdown(mp))
1213 : return -EIO;
1214 :
1215 8334953 : error = xfs_qm_dqattach(sip);
1216 8334854 : if (error)
1217 0 : goto std_return;
1218 :
1219 8334854 : error = xfs_qm_dqattach(tdp);
1220 8334936 : if (error)
1221 4 : goto std_return;
1222 :
1223 8334932 : resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1224 8334932 : error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
1225 : &tp, &nospace_error);
1226 8335421 : if (error)
1227 0 : goto std_return;
1228 :
1229 : /*
1230 : * If we are using project inheritance, we only allow hard link
1231 : * creation in our tree when the project IDs are the same; else
1232 : * the tree quota mechanism could be circumvented.
1233 : */
1234 8335421 : if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1235 : tdp->i_projid != sip->i_projid)) {
1236 0 : error = -EXDEV;
1237 0 : goto error_return;
1238 : }
1239 :
1240 8335421 : if (!resblks) {
1241 49393 : error = xfs_dir_canenter(tp, tdp, target_name);
1242 49391 : if (error)
1243 160 : goto error_return;
1244 : }
1245 :
1246 : /*
1247 : * Handle initial link state of O_TMPFILE inode
1248 : */
1249 8335259 : if (VFS_I(sip)->i_nlink == 0) {
1250 24880 : struct xfs_perag *pag;
1251 :
1252 24880 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1253 24887 : error = xfs_iunlink_remove(tp, pag, sip);
1254 24876 : xfs_perag_put(pag);
1255 24884 : if (error)
1256 0 : goto error_return;
1257 : }
1258 :
1259 8335263 : error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1260 : resblks);
1261 8335367 : if (error)
1262 5 : goto error_return;
1263 8335362 : xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1264 8334815 : xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1265 :
1266 8335660 : xfs_bumplink(tp, sip);
1267 :
1268 : /*
1269 : * If this is a synchronous mount, make sure that the
1270 : * link transaction goes to disk before returning to
1271 : * the user.
1272 : */
1273 8335480 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1274 0 : xfs_trans_set_sync(tp);
1275 :
1276 8335480 : return xfs_trans_commit(tp);
1277 :
1278 165 : error_return:
1279 165 : xfs_trans_cancel(tp);
1280 169 : std_return:
1281 169 : if (error == -ENOSPC && nospace_error)
1282 160 : error = nospace_error;
1283 : return error;
1284 : }
1285 :
1286 : /* Clear the reflink flag and the cowblocks tag if possible. */
1287 : static void
1288 30630467 : xfs_itruncate_clear_reflink_flags(
1289 : struct xfs_inode *ip)
1290 : {
1291 30630467 : struct xfs_ifork *dfork;
1292 30630467 : struct xfs_ifork *cfork;
1293 :
1294 30630467 : if (!xfs_is_reflink_inode(ip))
1295 : return;
1296 6924364 : dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1297 6924364 : cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
1298 6924364 : if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1299 920116 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1300 6924364 : if (cfork->if_bytes == 0)
1301 3991155 : xfs_inode_clear_cowblocks_tag(ip);
1302 : }
1303 :
1304 : /*
1305 : * Free up the underlying blocks past new_size. The new size must be smaller
1306 : * than the current size. This routine can be used both for the attribute and
1307 : * data fork, and does not modify the inode size, which is left to the caller.
1308 : *
1309 : * The transaction passed to this routine must have made a permanent log
1310 : * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1311 : * given transaction and start new ones, so make sure everything involved in
1312 : * the transaction is tidy before calling here. Some transaction will be
1313 : * returned to the caller to be committed. The incoming transaction must
1314 : * already include the inode, and both inode locks must be held exclusively.
1315 : * The inode must also be "held" within the transaction. On return the inode
1316 : * will be "held" within the returned transaction. This routine does NOT
1317 : * require any disk space to be reserved for it within the transaction.
1318 : *
1319 : * If we get an error, we must return with the inode locked and linked into the
1320 : * current transaction. This keeps things simple for the higher level code,
1321 : * because it always knows that the inode is locked and held in the transaction
1322 : * that returns to it whether errors occur or not. We don't mark the inode
1323 : * dirty on error so that transactions can be easily aborted if possible.
1324 : */
1325 : int
1326 31289836 : xfs_itruncate_extents_flags(
1327 : struct xfs_trans **tpp,
1328 : struct xfs_inode *ip,
1329 : int whichfork,
1330 : xfs_fsize_t new_size,
1331 : int flags)
1332 : {
1333 31289836 : struct xfs_mount *mp = ip->i_mount;
1334 31289836 : struct xfs_trans *tp = *tpp;
1335 31289836 : xfs_fileoff_t first_unmap_block;
1336 31289836 : xfs_filblks_t unmap_len;
1337 31289836 : int error = 0;
1338 :
1339 31289836 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1340 49862658 : ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1341 : xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1342 62579672 : ASSERT(new_size <= XFS_ISIZE(ip));
1343 31289836 : ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1344 31289836 : ASSERT(ip->i_itemp != NULL);
1345 31289836 : ASSERT(ip->i_itemp->ili_lock_flags == 0);
1346 31289836 : ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1347 :
1348 31289836 : trace_xfs_itruncate_extents_start(ip, new_size);
1349 :
1350 31273260 : flags |= xfs_bmapi_aflag(whichfork);
1351 :
1352 : /*
1353 : * Since it is possible for space to become allocated beyond
1354 : * the end of the file (in a crash where the space is allocated
1355 : * but the inode size is not yet updated), simply remove any
1356 : * blocks which show up between the new EOF and the maximum
1357 : * possible file size.
1358 : *
1359 : * We have to free all the blocks to the bmbt maximum offset, even if
1360 : * the page cache can't scale that far.
1361 : */
1362 31273260 : first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1363 31273260 : if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1364 0 : WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1365 : return 0;
1366 : }
1367 :
1368 31286096 : unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1369 93821468 : while (unmap_len > 0) {
1370 62500116 : ASSERT(tp->t_highest_agno == NULLAGNUMBER);
1371 62500116 : error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1372 : flags, XFS_ITRUNC_MAX_EXTENTS);
1373 62552971 : if (error)
1374 383 : goto out;
1375 :
1376 : /* free the just unmapped extents */
1377 62552588 : error = xfs_defer_finish(&tp);
1378 62537307 : if (error)
1379 1935 : goto out;
1380 : }
1381 :
1382 31321352 : if (whichfork == XFS_DATA_FORK) {
1383 : /* Remove all pending CoW reservations. */
1384 30622429 : error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1385 : first_unmap_block, XFS_MAX_FILEOFF, true);
1386 30591800 : if (error)
1387 0 : goto out;
1388 :
1389 30591800 : xfs_itruncate_clear_reflink_flags(ip);
1390 : }
1391 :
1392 : /*
1393 : * Always re-log the inode so that our permanent transaction can keep
1394 : * on rolling it forward in the log.
1395 : */
1396 31311795 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1397 :
1398 31469839 : trace_xfs_itruncate_extents_end(ip, new_size);
1399 :
1400 31467720 : out:
1401 31467720 : *tpp = tp;
1402 31467720 : return error;
1403 : }
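/*
 * Editor's illustrative caller contract (a sketch; compare
 * xfs_inactive_truncate() below): the inode is locked, joined and held,
 * and whatever transaction comes back after any internal rolls is
 * committed by the caller:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	if (!error)
 *		error = xfs_trans_commit(tp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */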
1404 :
1405 : int
1406 509826622 : xfs_release(
1407 : xfs_inode_t *ip)
1408 : {
1409 509826622 : xfs_mount_t *mp = ip->i_mount;
1410 509826622 : int error = 0;
1411 :
1412 509826622 : if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1413 : return 0;
1414 :
1415 : /* If this is a read-only mount, don't do this (would generate I/O) */
1416 1019199408 : if (xfs_is_readonly(mp))
1417 : return 0;
1418 :
1419 1013007944 : if (!xfs_is_shutdown(mp)) {
1420 505575253 : int truncated;
1421 :
1422 : /*
1423 : * If we previously truncated this file and removed old data
1424 : * in the process, we want to initiate "early" writeout on
1425 : * the last close. This is an attempt to combat the notorious
1426 : * NULL files problem which is particularly noticeable from a
1427 : * truncate down, buffered (re-)write (delalloc), followed by
1428 : * a crash. What we are effectively doing here is
1429 : * significantly reducing the time window where we'd otherwise
1430 : * be exposed to that problem.
1431 : */
1432 505575253 : truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1433 506589803 : if (truncated) {
1434 1949648 : xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1435 1949644 : if (ip->i_delayed_blks > 0) {
1436 216841 : error = filemap_flush(VFS_I(ip)->i_mapping);
1437 216848 : if (error)
1438 : return error;
1439 : }
1440 : }
1441 : }
1442 :
1443 507518477 : if (VFS_I(ip)->i_nlink == 0)
1444 : return 0;
1445 :
1446 : /*
1447 : * If we can't get the iolock just skip truncating the blocks past EOF
1448 : * because we could deadlock with the mmap_lock otherwise. We'll get
1449 : * another chance to drop them once the last reference to the inode is
1450 : * dropped, so we'll never leak blocks permanently.
1451 : */
1452 499152060 : if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1453 : return 0;
1454 :
1455 497888343 : if (xfs_can_free_eofblocks(ip, false)) {
1456 : /*
1457 : * Check if the inode is being opened, written and closed
1458 : * frequently and we have delayed allocation blocks outstanding
1459 : * (e.g. streaming writes from the NFS server), truncating the
1460 : * blocks past EOF will cause fragmentation to occur.
1461 : *
1462 : * In this case don't do the truncation, but we have to be
1463 : * careful how we detect this case. Blocks beyond EOF show up as
1464 : * i_delayed_blks even when the inode is clean, so we need to
1465 : * truncate them away first before checking for a dirty release.
1466 : * Hence on the first dirty close we will still remove the
1467 : * speculative allocation, but after that we will leave it in
1468 : * place.
1469 : */
1470 100306919 : if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1471 39891606 : goto out_unlock;
1472 :
1473 10259060 : error = xfs_free_eofblocks(ip);
1474 10374378 : if (error)
1475 1 : goto out_unlock;
1476 :
1477 : /* delalloc blocks after truncation means it really is dirty */
1478 10374377 : if (ip->i_delayed_blks)
1479 10334534 : xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1480 : }
1481 :
1482 447814589 : out_unlock:
1483 498054156 : xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1484 498054156 : return error;
1485 : }
1486 :
1487 : /*
1488 : * xfs_inactive_truncate
1489 : *
1490 : * Called to perform a truncate when an inode becomes unlinked.
1491 : */
1492 : STATIC int
1493 11973272 : xfs_inactive_truncate(
1494 : struct xfs_inode *ip)
1495 : {
1496 11973272 : struct xfs_mount *mp = ip->i_mount;
1497 11973272 : struct xfs_trans *tp;
1498 11973272 : int error;
1499 :
1500 11973272 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1501 11969028 : if (error) {
1502 1936 : ASSERT(xfs_is_shutdown(mp));
1503 968 : return error;
1504 : }
1505 11968060 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1506 11970520 : xfs_trans_ijoin(tp, ip, 0);
1507 :
1508 : /*
1509 : * Log the inode size first to prevent stale data exposure in the event
1510 : * of a system crash before the truncate completes. See the related
1511 : * comment in xfs_vn_setattr_size() for details.
1512 : */
1513 11975171 : ip->i_disk_size = 0;
1514 11975171 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1515 :
1516 11981882 : error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1517 11989497 : if (error)
1518 1472 : goto error_trans_cancel;
1519 :
1520 11988025 : ASSERT(ip->i_df.if_nextents == 0);
1521 :
1522 11988025 : error = xfs_trans_commit(tp);
1523 11983064 : if (error)
1524 0 : goto error_unlock;
1525 :
1526 11983064 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1527 11983064 : return 0;
1528 :
1529 : error_trans_cancel:
1530 1472 : xfs_trans_cancel(tp);
1531 1471 : error_unlock:
1532 1471 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1533 1471 : return error;
1534 : }
1535 :
1536 : /*
1537 : * xfs_inactive_ifree()
1538 : *
1539 : * Perform the inode free when an inode is unlinked.
1540 : */
1541 : STATIC int
1542 57745759 : xfs_inactive_ifree(
1543 : struct xfs_inode *ip)
1544 : {
1545 57745759 : struct xfs_mount *mp = ip->i_mount;
1546 57745759 : struct xfs_trans *tp;
1547 57745759 : int error;
1548 :
1549 : /*
1550 : * We try to use a per-AG reservation for any block needed by the finobt
1551 : * tree, but as the finobt feature predates the per-AG reservation
1552 : * support a degraded file system might not have enough space for the
1553 : * reservation at mount time. In that case try to dip into the reserved
1554 : * pool and pray.
1555 : *
1556 : * Send a warning if the reservation does happen to fail, as the inode
1557 : * now remains allocated and sits on the unlinked list until the fs is
1558 : * repaired.
1559 : */
1560 57745759 : if (unlikely(mp->m_finobt_nores)) {
1561 0 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1562 : XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1563 : &tp);
1564 : } else {
1565 57745759 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1566 : }
1567 57710579 : if (error) {
1568 230 : if (error == -ENOSPC) {
1569 0 : xfs_warn_ratelimited(mp,
1570 : "Failed to remove inode(s) from unlinked list. "
1571 : "Please free space, unmount and run xfs_repair.");
1572 : } else {
1573 460 : ASSERT(xfs_is_shutdown(mp));
1574 : }
1575 230 : return error;
1576 : }
1577 :
1578 : /*
1579 : * We do not hold the inode locked across the entire rolling transaction
1580 : * here. We only need to hold it for the first transaction that
1581 : * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1582 : * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1583 : * here breaks the relationship between cluster buffer invalidation and
1584 : * stale inode invalidation on cluster buffer item journal commit
1585 : * completion, and can result in leaving dirty stale inodes hanging
1586 : * around in memory.
1587 : *
1588 : * We have no need for serialising this inode operation against other
1589 : * operations - we freed the inode and hence reallocation is required
1590 : * and that will serialise on reallocating the space the deferops need
1591 : * to free. Hence we can unlock the inode on the first commit of
1592 : * the transaction rather than roll it right through the deferops. This
1593 : * avoids relogging the XFS_ISTALE inode.
1594 : *
1595 : * We check that xfs_ifree() hasn't grown an internal transaction roll
1596 : * by asserting that the inode is still locked when it returns.
1597 : */
1598 57710349 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1599 57692623 : xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1600 :
1601 57720524 : error = xfs_ifree(tp, ip);
1602 57859325 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1603 57859325 : if (error) {
1604 : /*
1605 : * If we fail to free the inode, shut down. The transaction
1606 : * cancel might do that for us, but we need to make sure. Otherwise the
1607 : * inode might be lost for a long time or forever.
1608 : */
1609 260 : if (!xfs_is_shutdown(mp)) {
1610 6 : xfs_notice(mp, "%s: xfs_ifree returned error %d",
1611 : __func__, error);
1612 6 : xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1613 : }
1614 130 : xfs_trans_cancel(tp);
1615 130 : return error;
1616 : }
1617 :
1618 : /*
1619 : * Credit the quota account(s). The inode is gone.
1620 : */
1621 57859195 : xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1622 :
1623 57802880 : return xfs_trans_commit(tp);
1624 : }
1625 :
1626 : /*
1627 : * Returns true if we need to update the on-disk metadata before we can free
1628 : * the memory used by this inode. Updates include freeing post-eof
1629 : * preallocations; freeing COW staging extents; and marking the inode free in
1630 : * the inobt if it is on the unlinked list.
1631 : */
1632 : bool
1633 1085970267 : xfs_inode_needs_inactive(
1634 : struct xfs_inode *ip)
1635 : {
1636 1085970267 : struct xfs_mount *mp = ip->i_mount;
1637 1085970267 : struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1638 :
1639 : /*
1640 : * If the inode is already free, then there can be nothing
1641 : * to clean up here.
1642 : */
1643 1085970267 : if (VFS_I(ip)->i_mode == 0)
1644 : return false;
1645 :
1646 : /* If this is a read-only mount, don't do this (would generate I/O) */
1647 2171940534 : if (xfs_is_readonly(mp))
1648 : return false;
1649 :
1650 : /* If the log isn't running, push inodes straight to reclaim. */
1651 2121486882 : if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1652 : return false;
1653 :
1654 : /* Metadata inodes require explicit resource cleanup. */
1655 723050811 : if (xfs_is_metadata_inode(ip))
1656 : return false;
1657 :
1658 : /* Want to clean out the cow blocks if there are any. */
1659 722897917 : if (cow_ifp && cow_ifp->if_bytes > 0)
1660 : return true;
1661 :
1662 : /* Unlinked files must be freed. */
1663 721861513 : if (VFS_I(ip)->i_nlink == 0)
1664 : return true;
1665 :
1666 : /*
1667 : * This file isn't being freed, so check if there are post-eof blocks
1668 : * to free. @force is true because we are evicting an inode from the
1669 : * cache. Post-eof blocks must be freed, lest we end up with broken
1670 : * free space accounting.
1671 : *
1672 : * Note: don't bother with iolock here since lockdep complains about
1673 : * acquiring it in reclaim context. We have the only reference to the
1674 : * inode at this point anyways.
1675 : */
1676 665068354 : return xfs_can_free_eofblocks(ip, true);
1677 : }
1678 :
1679 : /*
1680 : * xfs_inactive
1681 : *
1682 : * This is called when the vnode reference count for the vnode
1683 : * goes to zero. If the file has been unlinked, then it must
1684 : * now be truncated. Also, we clear all of the read-ahead state
1685 : * kept for the inode here since the file is now closed.
1686 : */
1687 : int
1688 58008966 : xfs_inactive(
1689 : xfs_inode_t *ip)
1690 : {
1691 58008966 : struct xfs_mount *mp;
1692 58008966 : int error = 0;
1693 58008966 : int truncate = 0;
1694 :
1695 : /*
1696 : * If the inode is already free, then there can be nothing
1697 : * to clean up here.
1698 : */
1699 58008966 : if (VFS_I(ip)->i_mode == 0) {
1700 0 : ASSERT(ip->i_df.if_broot_bytes == 0);
1701 0 : goto out;
1702 : }
1703 :
1704 58008966 : mp = ip->i_mount;
1705 115978395 : ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1706 :
1707 : /* If this is a read-only mount, don't do this (would generate I/O) */
1708 115938858 : if (xfs_is_readonly(mp))
1709 0 : goto out;
1710 :
1711 : /* Metadata inodes require explicit resource cleanup. */
1712 57969429 : if (xfs_is_metadata_inode(ip))
1713 0 : goto out;
1714 :
1715 : /* Try to clean out the cow blocks if there are any. */
1716 115938858 : if (xfs_inode_has_cow_data(ip))
1717 1035842 : xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1718 :
1719 57967568 : if (VFS_I(ip)->i_nlink != 0) {
1720 : /*
1721 : * force is true because we are evicting an inode from the
1722 : * cache. Post-eof blocks must be freed, lest we end up with
1723 : * broken free space accounting.
1724 : *
1725 : * Note: don't bother with iolock here since lockdep complains
1726 : * about acquiring it in reclaim context. We have the only
1727 : * reference to the inode at this point anyways.
1728 : */
1729 255589 : if (xfs_can_free_eofblocks(ip, true))
1730 133588 : error = xfs_free_eofblocks(ip);
1731 :
1732 255617 : goto out;
1733 : }
1734 :
1735 57711979 : if (S_ISREG(VFS_I(ip)->i_mode) &&
1736 30668871 : (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
1737 18713782 : ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1738 : truncate = 1;
1739 :
1740 57711979 : error = xfs_qm_dqattach(ip);
1741 57705951 : if (error)
1742 19 : goto out;
1743 :
1744 57705932 : if (S_ISLNK(VFS_I(ip)->i_mode))
1745 24642735 : error = xfs_inactive_symlink(ip);
1746 33063197 : else if (truncate)
1747 11973190 : error = xfs_inactive_truncate(ip);
1748 57744463 : if (error)
1749 2454 : goto out;
1750 :
1751 : /*
1752 : * If there are attributes associated with the file then blow them away
1753 : * now. The code calls a routine that recursively deconstructs the
1754 : * attribute fork. It also blows away the in-core attribute fork.
1755 : */
1756 57742009 : if (xfs_inode_has_attr_fork(ip)) {
1757 1444031 : error = xfs_attr_inactive(ip);
1758 1446525 : if (error)
1759 146 : goto out;
1760 : }
1761 :
1762 57744357 : ASSERT(ip->i_forkoff == 0);
1763 :
1764 : /*
1765 : * Free the inode.
1766 : */
1767 57744357 : error = xfs_inactive_ifree(ip);
1768 :
1769 58102691 : out:
1770 : /*
1771 : * We're done making metadata updates for this inode, so we can release
1772 : * the attached dquots.
1773 : */
1774 58102691 : xfs_qm_dqdetach(ip);
1775 58066688 : return error;
1776 : }
1777 :
1778 : /*
1779 : * In-Core Unlinked List Lookups
1780 : * =============================
1781 : *
1782 : * Every inode is supposed to be reachable from some other piece of metadata
1783 : * with the exception of the root directory. Inodes with a connection to a
1784 : * file descriptor but not linked from anywhere in the on-disk directory tree
1785 : * are collectively known as unlinked inodes, though the filesystem itself
1786 : * maintains links to these inodes so that on-disk metadata are consistent.
1787 : *
1788 : * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1789 : * header contains a number of buckets that point to an inode, and each inode
1790 : * record has a pointer to the next inode in the hash chain. This
1791 : * singly-linked list causes scaling problems in the iunlink remove function
1792 : * because we must walk that list to find the inode that points to the inode
1793 : * being removed from the unlinked hash bucket list.
1794 : *
1795 : * Hence we keep an in-memory doubly linked list to link each inode on an
1796 : * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
1797 : * based lists would require having 64 list heads in the perag, one for each
1798 : * list. This is expensive in terms of memory (think millions of AGs) and cache
1799 : * misses on lookups. Instead, use the fact that inodes on the unlinked list
1800 : * must be referenced at the VFS level to keep them on the list and hence we
1801 : * have an existence guarantee for inodes on the unlinked list.
1802 : *
1803 : * Given we have an existence guarantee, we can use lockless inode cache lookups
1804 : * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
1805 : * for the doubly linked unlinked list, and we don't need any extra locking to
1806 : * keep the list safe as all manipulations are done under the AGI buffer lock.
1807 : * Keeping the list up to date does not require memory allocation, just finding
1808 : * the XFS inode and updating the next/prev unlinked list aginos.
1809 : */
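 : /*
 :  * Illustrative layout (a sketch, not kernel code). On disk only the
 :  * forward pointers of a bucket chain exist; incore we also track the
 :  * back pointers:
 :  *
 :  *	AGI bucket[agino % XFS_AGI_UNLINKED_BUCKETS]
 :  *		-> A -> B -> C -> NULLAGINO		(on disk)
 :  *		   A <-> B <-> C			(incore)
 :  *
 :  * With the AGI buffer locked, unlinking B from the middle of the chain
 :  * is then, conceptually:
 :  *
 :  *	prev = xfs_iunlink_lookup(pag, B->i_prev_unlinked);
 :  *	log prev's on-disk next_unlinked as B->i_next_unlinked;
 :  *	prev->i_next_unlinked = B->i_next_unlinked;
 :  *	xfs_iunlink_update_backref(pag, B->i_prev_unlinked,
 :  *				   B->i_next_unlinked);
 :  *
 :  * which is what xfs_iunlink_remove_inode() below does without ever
 :  * walking the on-disk chain.
 :  */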
1810 :
1811 : /*
1812 : * Find an inode on the unlinked list. This does not take references to the
1813 : * inode as we have an existence guarantee: we hold the AGI buffer lock and
1814 : * only unlinked, referenced inodes can be on the unlinked inode list. If we
1815 : * don't find the inode in cache, then let the caller handle the situation.
1816 : */
1817 : static struct xfs_inode *
1818 55746271 : xfs_iunlink_lookup(
1819 : struct xfs_perag *pag,
1820 : xfs_agino_t agino)
1821 : {
1822 55746271 : struct xfs_inode *ip;
1823 :
1824 55746271 : rcu_read_lock();
1825 55435850 : ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1826 :
1827 : /*
1828 : * Not finding the inode in memory, or finding it in RCU freeing limbo,
1829 : * should not happen. Warn about it and let the caller handle the failure.
1830 : */
1831 111769996 : if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
1832 0 : rcu_read_unlock();
1833 0 : return NULL;
1834 : }
1835 111895116 : ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
1836 56102194 : rcu_read_unlock();
1837 56102194 : return ip;
1838 : }
1839 :
1840 : /* Update the prev pointer of the next agino. */
1841 : static int
1842 120009367 : xfs_iunlink_update_backref(
1843 : struct xfs_perag *pag,
1844 : xfs_agino_t prev_agino,
1845 : xfs_agino_t next_agino)
1846 : {
1847 120009367 : struct xfs_inode *ip;
1848 :
1849 : /* No update necessary if we are at the end of the list. */
1850 120009367 : if (next_agino == NULLAGINO)
1851 : return 0;
1852 :
1853 41758444 : ip = xfs_iunlink_lookup(pag, next_agino);
1854 41999289 : if (!ip)
1855 : return -EFSCORRUPTED;
1856 41999289 : ip->i_prev_unlinked = prev_agino;
1857 41999289 : return 0;
1858 : }
1859 :
1860 : /*
1861 : * Point the AGI unlinked bucket at an inode and log the results. The caller
1862 : * is responsible for validating the old value.
1863 : */
1864 : STATIC int
1865 105988523 : xfs_iunlink_update_bucket(
1866 : struct xfs_trans *tp,
1867 : struct xfs_perag *pag,
1868 : struct xfs_buf *agibp,
1869 : unsigned int bucket_index,
1870 : xfs_agino_t new_agino)
1871 : {
1872 105988523 : struct xfs_agi *agi = agibp->b_addr;
1873 105988523 : xfs_agino_t old_value;
1874 105988523 : int offset;
1875 :
1876 174309332 : ASSERT(xfs_verify_agino_or_null(pag, new_agino));
1877 :
1878 105988523 : old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1879 106008754 : trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
1880 : old_value, new_agino);
1881 :
1882 : /*
1883 : * We should never find the head of the list already set to the value
1884 : * passed in because either we're adding or removing ourselves from the
1885 : * head of the list.
1886 : */
1887 105701175 : if (old_value == new_agino) {
1888 0 : xfs_buf_mark_corrupt(agibp);
1889 0 : return -EFSCORRUPTED;
1890 : }
1891 :
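 : 	/*
 : 	 * Only the four bytes of this bucket are logged. E.g. (sketch):
 : 	 * bucket_index 3 dirties AGI byte offsets
 : 	 * offsetof(struct xfs_agi, agi_unlinked) + 12 through + 15,
 : 	 * inclusive, matching the offset computation below.
 : 	 */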
1892 105701175 : agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
1893 105700386 : offset = offsetof(struct xfs_agi, agi_unlinked) +
1894 : (sizeof(xfs_agino_t) * bucket_index);
1895 105700386 : xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
1896 105700386 : return 0;
1897 : }
1898 :
1899 : static int
1900 59826030 : xfs_iunlink_insert_inode(
1901 : struct xfs_trans *tp,
1902 : struct xfs_perag *pag,
1903 : struct xfs_buf *agibp,
1904 : struct xfs_inode *ip)
1905 : {
1906 59826030 : struct xfs_mount *mp = tp->t_mountp;
1907 59826030 : struct xfs_agi *agi = agibp->b_addr;
1908 59826030 : xfs_agino_t next_agino;
1909 59826030 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1910 59826030 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1911 59826030 : int error;
1912 :
1913 : /*
1914 : * Get the index into the agi hash table for the list this inode will
1915 : * go on. Make sure the pointer isn't garbage and that this inode
1916 : * isn't already on the list.
1917 : */
1918 59826030 : next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1919 59903331 : if (next_agino == agino ||
1920 : !xfs_verify_agino_or_null(pag, next_agino)) {
1921 0 : xfs_buf_mark_corrupt(agibp);
1922 0 : return -EFSCORRUPTED;
1923 : }
1924 :
1925 : /*
1926 : * Update the prev pointer in the next inode to point back to this
1927 : * inode.
1928 : */
1929 59903331 : error = xfs_iunlink_update_backref(pag, agino, next_agino);
1930 59966079 : if (error)
1931 : return error;
1932 :
1933 59966079 : if (next_agino != NULLAGINO) {
1934 : /*
1935 : * There is already another inode in the bucket, so point this
1936 : * inode to the current head of the list.
1937 : */
1938 22283954 : error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
1939 22303593 : if (error)
1940 : return error;
1941 22303593 : ip->i_next_unlinked = next_agino;
1942 : }
1943 :
1944 : /* Point the head of the list to point to this inode. */
1945 59985718 : return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
1946 : }
1947 :
1948 : /*
1949 : * This is called when the inode's link count has gone to 0 or we are creating
1950 : * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
1951 : *
1952 : * We place the on-disk inode on a list in the AGI. It will be pulled from this
1953 : * list when the inode is freed.
1954 : */
1955 : STATIC int
1956 59814507 : xfs_iunlink(
1957 : struct xfs_trans *tp,
1958 : struct xfs_inode *ip)
1959 : {
1960 59814507 : struct xfs_mount *mp = tp->t_mountp;
1961 59814507 : struct xfs_perag *pag;
1962 59814507 : struct xfs_buf *agibp;
1963 59814507 : int error;
1964 :
1965 59814507 : ASSERT(VFS_I(ip)->i_nlink == 0);
1966 59814507 : ASSERT(VFS_I(ip)->i_mode != 0);
1967 59814507 : trace_xfs_iunlink(ip);
1968 :
1969 59688601 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1970 :
1971 : /* Get the agi buffer first. It ensures lock ordering on the list. */
1972 60088821 : error = xfs_read_agi(pag, tp, &agibp);
1973 59845246 : if (error)
1974 6 : goto out;
1975 :
1976 59845240 : error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
1977 59940032 : out:
1978 59940032 : xfs_perag_put(pag);
1979 60059848 : return error;
1980 : }
1981 :
1982 : static int
1983 60160732 : xfs_iunlink_remove_inode(
1984 : struct xfs_trans *tp,
1985 : struct xfs_perag *pag,
1986 : struct xfs_buf *agibp,
1987 : struct xfs_inode *ip)
1988 : {
1989 60160732 : struct xfs_mount *mp = tp->t_mountp;
1990 60160732 : struct xfs_agi *agi = agibp->b_addr;
1991 60160732 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1992 60160732 : xfs_agino_t head_agino;
1993 60160732 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1994 60160732 : int error;
1995 :
1996 60160732 : trace_xfs_iunlink_remove(ip);
1997 :
1998 : /*
1999 : * Get the index into the agi hash table for the list this inode is
2000 : * on. Make sure the head pointer isn't garbage.
2001 : */
2002 60144684 : head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2003 60158826 : if (!xfs_verify_agino(pag, head_agino)) {
2004 0 : XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2005 : agi, sizeof(*agi));
2006 0 : return -EFSCORRUPTED;
2007 : }
2008 :
2009 : /*
2010 : * Log our inode's next_unlinked pointer as NULLAGINO to take it off
2011 : * the list, then update whatever was previous to us in the list to
2012 : * point to whatever was next in the list.
2013 : */
2014 60158826 : error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
2015 60153039 : if (error)
2016 : return error;
2017 :
2018 : /*
2019 : * Update the prev pointer in the next inode to point back to previous
2020 : * inode in the chain.
2021 : */
2022 60154914 : error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
2023 : ip->i_next_unlinked);
2024 60129427 : if (error)
2025 : return error;
2026 :
2027 60129427 : if (head_agino != agino) {
2028 14075401 : struct xfs_inode *prev_ip;
2029 :
2030 14075401 : prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
2031 14114143 : if (!prev_ip)
2032 : return -EFSCORRUPTED;
2033 :
2034 14114143 : error = xfs_iunlink_log_inode(tp, prev_ip, pag,
2035 : ip->i_next_unlinked);
2036 14112986 : prev_ip->i_next_unlinked = ip->i_next_unlinked;
2037 : } else {
2038 : /* Point the head of the list to the next unlinked inode. */
2039 46054026 : error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
2040 : ip->i_next_unlinked);
2041 : }
2042 :
2043 60191442 : ip->i_next_unlinked = NULLAGINO;
2044 60191442 : ip->i_prev_unlinked = NULLAGINO;
2045 60191442 : return error;
2046 : }
2047 :
2048 : /*
2049 : * Pull the on-disk inode from the AGI unlinked list.
2050 : */
2051 : STATIC int
2052 60175402 : xfs_iunlink_remove(
2053 : struct xfs_trans *tp,
2054 : struct xfs_perag *pag,
2055 : struct xfs_inode *ip)
2056 : {
2057 60175402 : struct xfs_buf *agibp;
2058 60175402 : int error;
2059 :
2060 60175402 : trace_xfs_iunlink_remove(ip);
2061 :
2062 : /* Get the agi buffer first. It ensures lock ordering on the list. */
2063 60151484 : error = xfs_read_agi(pag, tp, &agibp);
2064 60154642 : if (error)
2065 : return error;
2066 :
2067 60159303 : return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
2068 : }
2069 :
2070 : /*
2071 : * Look up the specified inode number and, if it is not already marked XFS_ISTALE,
2072 : * mark it stale. We should only find clean inodes in this lookup that aren't
2073 : * already stale.
2074 : */
2075 : static void
2076 20705954 : xfs_ifree_mark_inode_stale(
2077 : struct xfs_perag *pag,
2078 : struct xfs_inode *free_ip,
2079 : xfs_ino_t inum)
2080 : {
2081 20705954 : struct xfs_mount *mp = pag->pag_mount;
2082 20705954 : struct xfs_inode_log_item *iip;
2083 20705954 : struct xfs_inode *ip;
2084 :
2085 20705954 : retry:
2086 20705954 : rcu_read_lock();
2087 20705960 : ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2088 :
2089 : /* Inode not in memory, nothing to do */
2090 20706420 : if (!ip) {
2091 2088778 : rcu_read_unlock();
2092 2088778 : return;
2093 : }
2094 :
2095 : /*
2096 : * Because this is an RCU-protected lookup, we could find a recently
2097 : * freed or even reallocated inode during the lookup. We need to check
2098 : * under the i_flags_lock for a valid inode here. Skip it if it is not
2099 : * valid, the wrong inode or stale.
2100 : */
2101 18617642 : spin_lock(&ip->i_flags_lock);
2102 18616775 : if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2103 43298 : goto out_iflags_unlock;
2104 :
2105 : /*
2106 : * We don't try to lock/unlock the current inode, but we _cannot_ skip
2107 : * any other inode that we did not find in the list attached to the
2108 : * buffer and that is not already marked stale. If we can't lock it,
2109 : * back off and retry.
2110 : */
2111 18573477 : if (ip != free_ip) {
2112 18228140 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2113 0 : spin_unlock(&ip->i_flags_lock);
2114 0 : rcu_read_unlock();
2115 0 : delay(1);
2116 0 : goto retry;
2117 : }
2118 : }
2119 18574078 : ip->i_flags |= XFS_ISTALE;
2120 :
2121 : /*
2122 : * If the inode is flushing, it is already attached to the buffer. All
2123 : * we need to do here is mark the inode stale so that buffer IO completion
2124 : * will remove it from the AIL.
2125 : */
2126 18574078 : iip = ip->i_itemp;
2127 18574078 : if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2128 3760 : ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2129 3760 : ASSERT(iip->ili_last_fields);
2130 3760 : goto out_iunlock;
2131 : }
2132 :
2133 : /*
2134 : * Inodes not attached to the buffer can be released immediately.
2135 : * Everything else has to go through xfs_iflush_abort() on journal
2136 : * commit as the flock synchronises removal of the inode from the
2137 : * cluster buffer against inode reclaim.
2138 : */
2139 18570318 : if (!iip || list_empty(&iip->ili_item.li_bio_list))
2140 883543 : goto out_iunlock;
2141 :
2142 17686775 : __xfs_iflags_set(ip, XFS_IFLUSHING);
2143 17686775 : spin_unlock(&ip->i_flags_lock);
2144 17687963 : rcu_read_unlock();
2145 :
2146 : /* we have a dirty inode in memory that has not yet been flushed. */
2147 17687573 : spin_lock(&iip->ili_lock);
2148 17688570 : iip->ili_last_fields = iip->ili_fields;
2149 17688570 : iip->ili_fields = 0;
2150 17688570 : iip->ili_fsync_fields = 0;
2151 17688570 : spin_unlock(&iip->ili_lock);
2152 17687663 : ASSERT(iip->ili_last_fields);
2153 :
2154 17687663 : if (ip != free_ip)
2155 17459391 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2156 : return;
2157 :
2158 887303 : out_iunlock:
2159 887303 : if (ip != free_ip)
2160 769228 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2161 118075 : out_iflags_unlock:
2162 930614 : spin_unlock(&ip->i_flags_lock);
2163 930633 : rcu_read_unlock();
2164 : }
2165 :
2166 : /*
2167 : * A big issue when freeing the inode cluster is that we _cannot_ skip any
2168 : * inodes that are in memory - they all must be marked stale and attached to
2169 : * the cluster buffer.
2170 : */
2171 : static int
2172 346353 : xfs_ifree_cluster(
2173 : struct xfs_trans *tp,
2174 : struct xfs_perag *pag,
2175 : struct xfs_inode *free_ip,
2176 : struct xfs_icluster *xic)
2177 : {
2178 346353 : struct xfs_mount *mp = free_ip->i_mount;
2179 346353 : struct xfs_ino_geometry *igeo = M_IGEO(mp);
2180 346353 : struct xfs_buf *bp;
2181 346353 : xfs_daddr_t blkno;
2182 346353 : xfs_ino_t inum = xic->first_ino;
2183 346353 : int nbufs;
2184 346353 : int i, j;
2185 346353 : int ioffset;
2186 346353 : int error;
2187 :
2188 346353 : nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2189 :
2190 1039067 : for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2191 : /*
2192 : * The allocation bitmap tells us which inodes of the chunk were
2193 : * physically allocated. Skip the cluster if an inode falls into
2194 : * a sparse region.
2195 : */
2196 692709 : ioffset = inum - xic->first_ino;
2197 692709 : if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2198 45565 : ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2199 45565 : continue;
2200 : }
2201 :
2202 647144 : blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2203 : XFS_INO_TO_AGBNO(mp, inum));
2204 :
2205 : /*
2206 : * We obtain and lock the backing buffer first in the process
2207 : * here to ensure dirty inodes attached to the buffer remain in
2208 : * the flushing state while we mark them stale.
2209 : *
2210 : * If we scan the in-memory inodes first, then buffer IO can
2211 : * complete before we get a lock on it, and hence we may fail
2212 : * to mark all the active inodes on the buffer stale.
2213 : */
2214 647144 : error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2215 647144 : mp->m_bsize * igeo->blocks_per_cluster,
2216 : XBF_UNMAPPED, &bp);
2217 647130 : if (error)
2218 0 : return error;
2219 :
2220 : /*
2221 : * This buffer may not have been correctly initialised as we
2222 : * didn't read it from disk. That's not important because we are
2223 : * only using it to mark the buffer as stale in the log, and to
2224 : * attach stale cached inodes on it. That means it will never be
2225 : * dispatched for IO. If it is, we want to know about it, and we
2226 : * want it to fail. We can achieve this by adding a write
2227 : * verifier to the buffer.
2228 : */
2229 647130 : bp->b_ops = &xfs_inode_buf_ops;
2230 :
2231 : /*
2232 : * Now we need to set all the cached clean inodes as XFS_ISTALE,
2233 : * too. This requires lookups, and will skip inodes that we've
2234 : * already marked XFS_ISTALE.
2235 : */
2236 21353096 : for (i = 0; i < igeo->inodes_per_cluster; i++)
2237 20705944 : xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
2238 :
2239 647152 : xfs_trans_stale_inode_buf(tp, bp);
2240 647144 : xfs_trans_binval(tp, bp);
2241 : }
2242 : return 0;
2243 : }
2244 :
2245 : /*
2246 : * This is called to return an inode to the inode free list. The inode should
2247 : * already be truncated to 0 length and have no pages associated with it. This
2248 : * routine also assumes that the inode is already a part of the transaction.
2249 : *
2250 : * The on-disk copy of the inode will have been added to the list of unlinked
2251 : * inodes in the AGI. We need to remove the inode from that list atomically with
2252 : * respect to freeing it here.
2253 : */
2254 : int
2255 57754953 : xfs_ifree(
2256 : struct xfs_trans *tp,
2257 : struct xfs_inode *ip)
2258 : {
2259 57754953 : struct xfs_mount *mp = ip->i_mount;
2260 57754953 : struct xfs_perag *pag;
2261 57754953 : struct xfs_icluster xic = { 0 };
2262 57754953 : struct xfs_inode_log_item *iip = ip->i_itemp;
2263 57754953 : int error;
2264 :
2265 57754953 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2266 57754953 : ASSERT(VFS_I(ip)->i_nlink == 0);
2267 57754953 : ASSERT(ip->i_df.if_nextents == 0);
2268 57754953 : ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2269 57754953 : ASSERT(ip->i_nblocks == 0);
2270 :
2271 57754953 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2272 :
2273 : /*
2274 : * Free the inode first so that we guarantee that the AGI lock is going
2275 : * to be taken before we remove the inode from the unlinked list. This
2276 : * makes the AGI lock -> unlinked list modification order the same as
2277 : * used in O_TMPFILE creation.
2278 : */
2279 57821331 : error = xfs_difree(tp, pag, ip->i_ino, &xic);
2280 57860976 : if (error)
2281 130 : goto out;
2282 :
2283 57860846 : error = xfs_iunlink_remove(tp, pag, ip);
2284 57843939 : if (error)
2285 0 : goto out;
2286 :
2287 : /*
2288 : * Free any local-format data sitting around before we reset the
2289 : * data fork to extents format. Note that the attr fork data has
2290 : * already been freed by xfs_attr_inactive.
2291 : */
2292 57843939 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2293 9022645 : kmem_free(ip->i_df.if_u1.if_data);
2294 9024008 : ip->i_df.if_u1.if_data = NULL;
2295 9024008 : ip->i_df.if_bytes = 0;
2296 : }
2297 :
2298 57845302 : VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2299 57845302 : ip->i_diflags = 0;
2300 57845302 : ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
2301 57845302 : ip->i_forkoff = 0; /* mark the attr fork not in use */
2302 57845302 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2303 115711544 : if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
2304 0 : xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
2305 :
2306 : /* Don't attempt to replay owner changes for a deleted inode */
2307 57866242 : spin_lock(&iip->ili_lock);
2308 57864470 : iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2309 57864470 : spin_unlock(&iip->ili_lock);
2310 :
2311 : /*
2312 : * Bump the generation count so no one will be confused
2313 : * by reincarnations of this inode.
2314 : */
2315 57859580 : VFS_I(ip)->i_generation++;
2316 57859580 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2317 :
2318 57860136 : if (xic.deleted)
2319 346357 : error = xfs_ifree_cluster(tp, pag, ip, &xic);
2320 57513779 : out:
2321 57860265 : xfs_perag_put(pag);
2322 57872082 : return error;
2323 : }
2324 :
2325 : /*
2326 : * This is called to unpin an inode. The caller must have the inode locked
2327 : * in at least shared mode so that the buffer cannot be subsequently pinned
2328 : * once someone is waiting for it to be unpinned.
2329 : */
2330 : static void
2331 3 : xfs_iunpin(
2332 : struct xfs_inode *ip)
2333 : {
2334 3 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2335 :
2336 3 : trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2337 :
2338 : /* Give the log a push to start the unpinning I/O */
2339 3 : xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2340 :
2341 3 : }
2342 :
2343 : static void
2344 3 : __xfs_iunpin_wait(
2345 : struct xfs_inode *ip)
2346 : {
2347 3 : wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2348 3 : DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2349 :
2350 3 : xfs_iunpin(ip);
2351 :
2352 3 : do {
2353 3 : prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2354 3 : if (xfs_ipincount(ip))
2355 0 : io_schedule();
2356 3 : } while (xfs_ipincount(ip));
2357 3 : finish_wait(wq, &wait.wq_entry);
2358 3 : }
2359 :
2360 : void
2361 339864664 : xfs_iunpin_wait(
2362 : struct xfs_inode *ip)
2363 : {
2364 339864664 : if (xfs_ipincount(ip))
2365 3 : __xfs_iunpin_wait(ip);
2366 339864664 : }
2367 :
2368 : /*
2369 : * Removing an inode from the namespace involves removing the directory entry
2370 : * and dropping the link count on the inode. Removing the directory entry can
2371 : * result in locking an AGF (directory blocks were freed) and removing a link
2372 : * count can result in placing the inode on an unlinked list which results in
2373 : * locking an AGI.
2374 : *
2375 : * The big problem here is that we have an ordering constraint on AGF and AGI
2376 : * locking - inode allocation locks the AGI, then can allocate a new extent for
2377 : * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2378 : * removes the inode from the unlinked list, requiring that we lock the AGI
2379 : * first, and then freeing the inode can result in an inode chunk being freed
2380 : * and hence freeing disk space requiring that we lock an AGF.
2381 : *
2382 : * Hence the ordering that is imposed by other parts of the code is AGI before
2383 : * AGF. This means we cannot remove the directory entry before we drop the inode
2384 : * reference count and put it on the unlinked list as this results in a lock
2385 : * order of AGF then AGI, and this can deadlock against inode allocation and
2386 : * freeing. Therefore we must drop the link counts before we remove the
2387 : * directory entry.
2388 : *
2389 : * This is still safe from a transactional point of view - it is not until we
2390 : * get to xfs_defer_finish() that we have the possibility of multiple
2391 : * transactions in this operation. Hence as long as we remove the directory
2392 : * entry and drop the link count in the first transaction of the remove
2393 : * operation, there are no transactional constraints on the ordering here.
2394 : */
2395 : int
2396 56589534 : xfs_remove(
2397 : xfs_inode_t *dp,
2398 : struct xfs_name *name,
2399 : xfs_inode_t *ip)
2400 : {
2401 56589534 : xfs_mount_t *mp = dp->i_mount;
2402 56589534 : xfs_trans_t *tp = NULL;
2403 56589534 : int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2404 56589534 : int dontcare;
2405 56589534 : int error = 0;
2406 56589534 : uint resblks;
2407 :
2408 56589534 : trace_xfs_remove(dp, name);
2409 :
2410 113136620 : if (xfs_is_shutdown(mp))
2411 : return -EIO;
2412 :
2413 56567938 : error = xfs_qm_dqattach(dp);
2414 56545939 : if (error)
2415 6 : goto std_return;
2416 :
2417 56545933 : error = xfs_qm_dqattach(ip);
2418 56564054 : if (error)
2419 0 : goto std_return;
2420 :
2421 : /*
2422 : * We try to get the real space reservation first, allowing for
2423 : * directory btree deletion(s) implying possible bmap insert(s). If we
2424 : * can't get the space reservation then we use 0 instead, and avoid the
2425 : * bmap btree insert(s) in the directory code: if a bmap insert tries
2426 : * to happen, the LAST block is trimmed from the directory instead.
2427 : *
2428 : * Ignore EDQUOT and ENOSPC being returned via nospace_error because
2429 : * the directory code can handle a reservationless update and we don't
2430 : * want to prevent a user from trying to free space by deleting things.
2431 : */
2432 56564054 : resblks = XFS_REMOVE_SPACE_RES(mp);
2433 56564054 : error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
2434 : &tp, &dontcare);
2435 56580622 : if (error) {
2436 0 : ASSERT(error != -ENOSPC);
2437 0 : goto std_return;
2438 : }
2439 :
2440 : /*
2441 : * If we're removing a directory perform some additional validation.
2442 : */
2443 56580622 : if (is_dir) {
2444 2885493 : ASSERT(VFS_I(ip)->i_nlink >= 2);
2445 2885493 : if (VFS_I(ip)->i_nlink != 2) {
2446 1303970 : error = -ENOTEMPTY;
2447 1303970 : goto out_trans_cancel;
2448 : }
2449 1581523 : if (!xfs_dir_isempty(ip)) {
2450 701901 : error = -ENOTEMPTY;
2451 701901 : goto out_trans_cancel;
2452 : }
2453 :
2454 : /* Drop the link from ip's "..". */
2455 879602 : error = xfs_droplink(tp, dp);
2456 879628 : if (error)
2457 0 : goto out_trans_cancel;
2458 :
2459 : /* Drop the "." link from ip to self. */
2460 879628 : error = xfs_droplink(tp, ip);
2461 879659 : if (error)
2462 0 : goto out_trans_cancel;
2463 :
2464 : /*
2465 : * Point the unlinked child directory's ".." entry to the root
2466 : * directory to eliminate back-references to inodes that may
2467 : * get freed before the child directory is closed. If the fs
2468 : * gets shrunk, this can lead to dirent inode validation errors.
2469 : */
2470 879659 : if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2471 793985 : error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2472 : tp->t_mountp->m_sb.sb_rootino, 0);
2473 793845 : if (error)
2474 0 : goto out_trans_cancel;
2475 : }
2476 : } else {
2477 : /*
2478 : * When removing a non-directory we need to log the parent
2479 : * inode here. For a directory this is done implicitly
2480 : * by the xfs_droplink call for the ".." entry.
2481 : */
2482 53695129 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2483 : }
2484 54581687 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2485 :
2486 : /* Drop the link from dp to ip. */
2487 54551676 : error = xfs_droplink(tp, ip);
2488 54584264 : if (error)
2489 5 : goto out_trans_cancel;
2490 :
2491 54584259 : error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2492 54590184 : if (error) {
2493 12 : ASSERT(error != -ENOENT);
2494 12 : goto out_trans_cancel;
2495 : }
2496 :
2497 : /*
2498 : * If this is a synchronous mount, make sure that the
2499 : * remove transaction goes to disk before returning to
2500 : * the user.
2501 : */
2502 54590172 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2503 654 : xfs_trans_set_sync(tp);
2504 :
2505 54590172 : error = xfs_trans_commit(tp);
2506 54578146 : if (error)
2507 0 : goto std_return;
2508 :
2509 54578146 : if (is_dir && xfs_inode_is_filestream(ip))
2510 4320 : xfs_filestream_deassociate(ip);
2511 :
2512 : return 0;
2513 :
2514 2005888 : out_trans_cancel:
2515 2005888 : xfs_trans_cancel(tp);
2516 : std_return:
2517 : return error;
2518 : }
2519 :
2520 : /*
2521 : * Enter all inodes for a rename transaction into a sorted array.
2522 : */
2523 : #define __XFS_SORT_INODES 5
2524 : STATIC void
2525 45999526 : xfs_sort_for_rename(
2526 : struct xfs_inode *dp1, /* in: old (source) directory inode */
2527 : struct xfs_inode *dp2, /* in: new (target) directory inode */
2528 : struct xfs_inode *ip1, /* in: inode of old entry */
2529 : struct xfs_inode *ip2, /* in: inode of new entry */
2530 : struct xfs_inode *wip, /* in: whiteout inode */
2531 : struct xfs_inode **i_tab,/* out: sorted array of inodes */
2532 : int *num_inodes) /* in/out: inodes in array */
2533 : {
2534 45999526 : int i, j;
2535 :
2536 45999526 : ASSERT(*num_inodes == __XFS_SORT_INODES);
2537 45999526 : memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2538 :
2539 : /*
2540 : * i_tab contains a list of pointers to inodes. We initialize
2541 : * the table here and we'll sort it. We will then use it to
2542 : * order the acquisition of the inode locks.
2543 : *
2544 : * Note that the table may contain duplicates. e.g., dp1 == dp2.
2545 : */
2546 45999526 : i = 0;
2547 45999526 : i_tab[i++] = dp1;
2548 45999526 : i_tab[i++] = dp2;
2549 45999526 : i_tab[i++] = ip1;
2550 45999526 : if (ip2)
2551 13545725 : i_tab[i++] = ip2;
2552 45999526 : if (wip)
2553 2308026 : i_tab[i++] = wip;
2554 45999526 : *num_inodes = i;
2555 :
2556 : /*
2557 : * Sort the elements via bubble sort. (Remember, there are at
2558 : * most 5 elements to sort, so this is adequate.)
2559 : */
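 : 	/*
 : 	 * Example (sketch): with dp1->i_ino = 131, dp2->i_ino = 17 and
 : 	 * ip1->i_ino = 64, i_tab sorts to { 17, 64, 131 }, so every
 : 	 * rename acquires inode locks in the same global order no matter
 : 	 * how its arguments were passed.
 : 	 */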
2560 199845483 : for (i = 0; i < *num_inodes; i++) {
2561 525037046 : for (j = 1; j < *num_inodes; j++) {
2562 371191089 : if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2563 82789028 : struct xfs_inode *temp = i_tab[j];
2564 82789028 : i_tab[j] = i_tab[j-1];
2565 82789028 : i_tab[j-1] = temp;
2566 : }
2567 : }
2568 : }
2569 45999526 : }
2570 :
2571 : static int
2572 45996720 : xfs_finish_rename(
2573 : struct xfs_trans *tp)
2574 : {
2575 : /*
2576 : * If this is a synchronous mount, make sure that the rename transaction
2577 : * goes to disk before returning to the user.
2578 : */
2579 45996720 : if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2580 0 : xfs_trans_set_sync(tp);
2581 :
2582 45996720 : return xfs_trans_commit(tp);
2583 : }
2584 :
2585 : /*
2586 : * xfs_cross_rename()
2587 : *
2588 : * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
2589 : */
2590 : STATIC int
2591 12931686 : xfs_cross_rename(
2592 : struct xfs_trans *tp,
2593 : struct xfs_inode *dp1,
2594 : struct xfs_name *name1,
2595 : struct xfs_inode *ip1,
2596 : struct xfs_inode *dp2,
2597 : struct xfs_name *name2,
2598 : struct xfs_inode *ip2,
2599 : int spaceres)
2600 : {
2601 12931686 : int error = 0;
2602 12931686 : int ip1_flags = 0;
2603 12931686 : int ip2_flags = 0;
2604 12931686 : int dp2_flags = 0;
2605 :
2606 : /* Swap inode number for dirent in first parent */
2607 12931686 : error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2608 12931659 : if (error)
2609 195 : goto out_trans_abort;
2610 :
2611 : /* Swap inode number for dirent in second parent */
2612 12931464 : error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2613 12931503 : if (error)
2614 6 : goto out_trans_abort;
2615 :
2616 : /*
2617 : * If we're renaming one or more directories across different parents,
2618 : * update the respective ".." entries (and link counts) to match the new
2619 : * parents.
2620 : */
2621 12931497 : if (dp1 != dp2) {
2622 12622392 : dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2623 :
2624 12622392 : if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2625 4187476 : error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2626 : dp1->i_ino, spaceres);
2627 4187476 : if (error)
2628 0 : goto out_trans_abort;
2629 :
2630 : /* transfer ip2 ".." reference to dp1 */
2631 4187476 : if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2632 44 : error = xfs_droplink(tp, dp2);
2633 44 : if (error)
2634 0 : goto out_trans_abort;
2635 44 : xfs_bumplink(tp, dp1);
2636 : }
2637 :
2638 : /*
2639 : * Although ip1 isn't changed here, userspace needs
2640 : * to be warned about the change, so that applications
2641 : * relying on it (like backup ones) will properly
2642 : * notice the change.
2643 : */
2644 : ip1_flags |= XFS_ICHGTIME_CHG;
2645 : ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2646 : }
2647 :
2648 12622392 : if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2649 4187476 : error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2650 : dp2->i_ino, spaceres);
2651 4187476 : if (error)
2652 0 : goto out_trans_abort;
2653 :
2654 : /* transfer ip1 ".." reference to dp2 */
2655 4187476 : if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2656 44 : error = xfs_droplink(tp, dp1);
2657 44 : if (error)
2658 0 : goto out_trans_abort;
2659 44 : xfs_bumplink(tp, dp2);
2660 : }
2661 :
2662 : /*
2663 : * Although ip2 isn't changed here, userspace needs
2664 : * to be warned about the change, so that applications
2665 : * relying on it (like backup ones) will properly
2666 : * notice the change.
2667 : */
2668 4187476 : ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2669 4187476 : ip2_flags |= XFS_ICHGTIME_CHG;
2670 : }
2671 : }
2672 :
2673 12931497 : if (ip1_flags) {
2674 4187520 : xfs_trans_ichgtime(tp, ip1, ip1_flags);
2675 4187520 : xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2676 : }
2677 12931497 : if (ip2_flags) {
2678 4187520 : xfs_trans_ichgtime(tp, ip2, ip2_flags);
2679 4187520 : xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2680 : }
2681 12931497 : if (dp2_flags) {
2682 12622392 : xfs_trans_ichgtime(tp, dp2, dp2_flags);
2683 12622392 : xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2684 : }
2685 12931497 : xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2686 12931492 : xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2687 12931498 : return xfs_finish_rename(tp);
2688 :
2689 201 : out_trans_abort:
2690 201 : xfs_trans_cancel(tp);
2691 201 : return error;
2692 : }
2693 :
2694 : /*
2695 : * xfs_rename_alloc_whiteout()
2696 : *
2697 : * Return a referenced, unlinked, unlocked inode that can be used as a
2698 : * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2699 : * crash between allocating the inode and linking it into the rename transaction
2700 : * recovery will free the inode and we won't leak it.
2701 : */
2702 : static int
2703 2365413 : xfs_rename_alloc_whiteout(
2704 : struct mnt_idmap *idmap,
2705 : struct xfs_name *src_name,
2706 : struct xfs_inode *dp,
2707 : struct xfs_inode **wip)
2708 : {
2709 2365413 : struct xfs_inode *tmpfile;
2710 2365413 : struct qstr name;
2711 2365413 : int error;
2712 :
2713 2365413 : error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
2714 : &tmpfile);
2715 2365472 : if (error)
2716 : return error;
2717 :
2718 2307937 : name.name = src_name->name;
2719 2307937 : name.len = src_name->len;
2720 2307937 : error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
2721 2307829 : if (error) {
2722 0 : xfs_finish_inode_setup(tmpfile);
2723 0 : xfs_irele(tmpfile);
2724 0 : return error;
2725 : }
2726 :
2727 : /*
2728 : * Prepare the tmpfile inode as if it were created through the VFS.
2729 : * Complete the inode setup and flag it as linkable. nlink is already
2730 : * zero, so we can skip the drop_nlink.
2731 : */
2732 2307829 : xfs_setup_iops(tmpfile);
2733 2307817 : xfs_finish_inode_setup(tmpfile);
2734 2308037 : VFS_I(tmpfile)->i_state |= I_LINKABLE;
2735 :
2736 2308037 : *wip = tmpfile;
2737 2308037 : return 0;
2738 : }
2739 :
2740 : /*
2741 : * xfs_rename
2742 : */
2743 : int
2744 46056967 : xfs_rename(
2745 : struct mnt_idmap *idmap,
2746 : struct xfs_inode *src_dp,
2747 : struct xfs_name *src_name,
2748 : struct xfs_inode *src_ip,
2749 : struct xfs_inode *target_dp,
2750 : struct xfs_name *target_name,
2751 : struct xfs_inode *target_ip,
2752 : unsigned int flags)
2753 : {
2754 46056967 : struct xfs_mount *mp = src_dp->i_mount;
2755 46056967 : struct xfs_trans *tp;
2756 46056967 : struct xfs_inode *wip = NULL; /* whiteout inode */
2757 46056967 : struct xfs_inode *inodes[__XFS_SORT_INODES];
2758 46056967 : int i;
2759 46056967 : int num_inodes = __XFS_SORT_INODES;
2760 46056967 : bool new_parent = (src_dp != target_dp);
2761 46056967 : bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2762 46056967 : int spaceres;
2763 46056967 : bool retried = false;
2764 46056967 : int error, nospace_error = 0;
2765 :
2766 46056967 : trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2767 :
2768 46056353 : if ((flags & RENAME_EXCHANGE) && !target_ip)
2769 : return -EINVAL;
2770 :
2771 : /*
2772 : * If we are doing a whiteout operation, allocate the whiteout inode
2773 : * we will be placing at the target and ensure the type is set
2774 : * appropriately.
2775 : */
2776 46056353 : if (flags & RENAME_WHITEOUT) {
2777 2365385 : error = xfs_rename_alloc_whiteout(idmap, src_name,
2778 : target_dp, &wip);
2779 2365408 : if (error)
2780 : return error;
2781 :
2782 : /* setup target dirent info as whiteout */
2783 2307882 : src_name->type = XFS_DIR3_FT_CHRDEV;
2784 : }
2785 :
2786 45998850 : xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2787 : inodes, &num_inodes);
2788 :
2789 46003238 : retry:
2790 46003238 : nospace_error = 0;
2791 46003238 : spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2792 46003238 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2793 46003119 : if (error == -ENOSPC) {
2794 350672 : nospace_error = error;
2795 350672 : spaceres = 0;
2796 350672 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2797 : &tp);
2798 : }
2799 46003090 : if (error)
2800 246 : goto out_release_wip;
2801 :
2802 : /*
2803 : * Attach the dquots to the inodes
2804 : */
2805 46002844 : error = xfs_qm_vop_rename_dqattach(inodes);
2806 46002443 : if (error)
2807 792 : goto out_trans_cancel;
2808 :
2809 : /*
2810 : * Lock all the participating inodes. Depending upon whether
2811 : * the target_name exists in the target directory, and
2812 : * whether the target directory is the same as the source
2813 : * directory, we can lock from 2 to 5 inodes.
2814 : */
2815 46001651 : xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2816 :
2817 : /*
2818 : * Join all the inodes to the transaction. From this point on,
2819 : * we can rely on either trans_commit or trans_cancel to unlock
2820 : * them.
2821 : */
2822 46002565 : xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2823 46002580 : if (new_parent)
2824 41505449 : xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2825 46002580 : xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2826 46002677 : if (target_ip)
2827 13544722 : xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2828 46002622 : if (wip)
2829 2308048 : xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2830 :
2831 : /*
2832 : * If we are using project inheritance, we only allow renames
2833 : * into our tree when the project IDs are the same; else the
2834 : * tree quota mechanism would be circumvented.
2835 : */
2836 46002603 : if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
2837 : target_dp->i_projid != src_ip->i_projid)) {
2838 0 : error = -EXDEV;
2839 0 : goto out_trans_cancel;
2840 : }
2841 :
2842 : /* RENAME_EXCHANGE is unique from here on. */
2843 46002603 : if (flags & RENAME_EXCHANGE)
2844 12931690 : return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2845 : target_dp, target_name, target_ip,
2846 : spaceres);
2847 :
2848 : /*
2849 : * Try to reserve quota to handle an expansion of the target directory.
2850 : * We'll allow the rename to continue in reservationless mode if we hit
2851 : * a space usage constraint. If we trigger reservationless mode, save
2852 : * the errno if there isn't any free space in the target directory.
2853 : */
2854 33070913 : if (spaceres != 0) {
2855 32812054 : error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
2856 : 0, false);
2857 32811914 : if (error == -EDQUOT || error == -ENOSPC) {
2858 7091 : if (!retried) {
2859 3832 : xfs_trans_cancel(tp);
2860 3832 : xfs_blockgc_free_quota(target_dp, 0);
2861 3832 : retried = true;
2862 3832 : goto retry;
2863 : }
2864 :
2865 : nospace_error = error;
2866 : spaceres = 0;
2867 : error = 0;
2868 : }
2869 32808082 : if (error)
2870 0 : goto out_trans_cancel;
2871 : }
2872 :
2873 : /*
2874 : * Check for expected errors before we dirty the transaction
2875 : * so we can return an error without a transaction abort.
2876 : */
2877 33066941 : if (target_ip == NULL) {
2878 : /*
2879 : * If there's no space reservation, check the entry will
2880 : * fit before actually inserting it.
2881 : */
2882 32453927 : if (!spaceres) {
2883 262146 : error = xfs_dir_canenter(tp, target_dp, target_name);
2884 262148 : if (error)
2885 1792 : goto out_trans_cancel;
2886 : }
2887 : } else {
2888 : /*
2889 : * If target exists and it's a directory, check whether
2890 : * it can be destroyed.
2891 : */
2892 615037 : if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
2893 2023 : (!xfs_dir_isempty(target_ip) ||
2894 1924 : (VFS_I(target_ip)->i_nlink > 2))) {
2895 99 : error = -EEXIST;
2896 99 : goto out_trans_cancel;
2897 : }
2898 : }
2899 :
2900 : /*
2901 : * Lock the AGI buffers we need to handle bumping the nlink of the
2902 : * whiteout inode off the unlinked list and to handle dropping the
2903 : * nlink of the target inode. Per locking order rules, do this in
2904 : * increasing AG order and before directory block allocation tries to
2905 : * grab AGFs because we grab AGIs before AGFs.
2906 : *
2907 : * The (vfs) caller must ensure that if src is a directory then
2908 : * target_ip is either null or an empty directory.
2909 : */
2910 135179608 : for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
2911 201921768 : if (inodes[i] == wip ||
2912 99807495 : (inodes[i] == target_ip &&
2913 612922 : (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
2914 2892318 : struct xfs_perag *pag;
2915 2892318 : struct xfs_buf *bp;
2916 :
2917 8676954 : pag = xfs_perag_get(mp,
2918 2892318 : XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
2919 2892462 : error = xfs_read_agi(pag, tp, &bp);
2920 2892427 : xfs_perag_put(pag);
2921 2892489 : if (error)
2922 12 : goto out_trans_cancel;
2923 : }
2924 : }
2925 :
2926 : /*
2927 : * Directory entry creation below may acquire the AGF. Remove
2928 : * the whiteout from the unlinked list first to preserve correct
2929 : * AGI/AGF locking order. This dirties the transaction so failures
2930 : * after this point will abort and log recovery will clean up the
2931 : * mess.
2932 : *
2933 : * For whiteouts, we need to bump the link count on the whiteout
2934 : * inode. After this point we have a real link, so clear the tmpfile
2935 : * state flag from the inode so that it doesn't accidentally get misused
2936 : * in the future.
2937 : */
2938 33065210 : if (wip) {
2939 2307030 : struct xfs_perag *pag;
2940 :
2941 2307030 : ASSERT(VFS_I(wip)->i_nlink == 0);
2942 :
2943 2307030 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
2944 2307011 : error = xfs_iunlink_remove(tp, pag, wip);
2945 2306858 : xfs_perag_put(pag);
2946 2307005 : if (error)
2947 0 : goto out_trans_cancel;
2948 :
2949 2307005 : xfs_bumplink(tp, wip);
2950 2306994 : VFS_I(wip)->i_state &= ~I_LINKABLE;
2951 : }
2952 :
2953 : /*
2954 : * Set up the target.
2955 : */
2956 33065174 : if (target_ip == NULL) {
2957 : /*
2958 : * If target does not exist and the rename crosses
2959 : * directories, adjust the target directory link count
2960 : * to account for the ".." reference from the new entry.
2961 : */
2962 32452254 : error = xfs_dir_createname(tp, target_dp, target_name,
2963 : src_ip->i_ino, spaceres);
2964 32452258 : if (error)
2965 21 : goto out_trans_cancel;
2966 :
2967 32452237 : xfs_trans_ichgtime(tp, target_dp,
2968 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2969 :
2970 32451960 : if (new_parent && src_is_directory) {
2971 9680045 : xfs_bumplink(tp, target_dp);
2972 : }
2973 : } else { /* target_ip != NULL */
2974 : /*
2975 : * Link the source inode under the target name.
2976 : * If the source inode is a directory and we are moving
2977 : * it across directories, its ".." entry will be
2978 : * inconsistent until we replace that down below.
2979 : *
2980 : * In case there is already an entry with the same
2981 : * name at the destination directory, remove it first.
2982 : */
2983 612920 : error = xfs_dir_replace(tp, target_dp, target_name,
2984 : src_ip->i_ino, spaceres);
2985 612910 : if (error)
2986 3 : goto out_trans_cancel;
2987 :
2988 612907 : xfs_trans_ichgtime(tp, target_dp,
2989 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2990 :
2991 : /*
2992 : * Decrement the link count on the target since the target
2993 : * dir no longer points to it.
2994 : */
2995 612898 : error = xfs_droplink(tp, target_ip);
2996 612917 : if (error)
2997 1 : goto out_trans_cancel;
2998 :
2999 612916 : if (src_is_directory) {
3000 : /*
3001 : * Drop the link from the old "." entry.
3002 : */
3003 1924 : error = xfs_droplink(tp, target_ip);
3004 1924 : if (error)
3005 0 : goto out_trans_cancel;
3006 : }
3007 : } /* target_ip != NULL */
3008 :
3009 : /*
3010 : * Remove the source.
3011 : */
3012 33064876 : if (new_parent && src_is_directory) {
3013 : /*
3014 : * Rewrite the ".." entry to point to the new
3015 : * directory.
3016 : */
3017 9680089 : error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3018 : target_dp->i_ino, spaceres);
3019 9680089 : ASSERT(error != -EEXIST);
3020 9680089 : if (error)
3021 0 : goto out_trans_cancel;
3022 : }
3023 :
3024 : /*
3025 : * We always want to hit the ctime on the source inode.
3026 : *
3027 : * This isn't strictly required by the standards since the source
3028 : * inode isn't really being changed, but old unix file systems did
3029 : * it and some incremental backup programs won't work without it.
3030 : */
3031 33064876 : xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3032 33065020 : xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3033 :
3034 : /*
3035 : * Adjust the link count on src_dp. This is necessary when
3036 : * renaming a directory, either within one parent when
3037 : * the target existed, or across two parent directories.
3038 : */
3039 33065337 : if (src_is_directory && (new_parent || target_ip != NULL)) {
3040 :
3041 : /*
3042 : * Decrement link count on src_directory since the
3043 : * entry that's moved no longer points to it.
3044 : */
3045 9681969 : error = xfs_droplink(tp, src_dp);
3046 9681969 : if (error)
3047 0 : goto out_trans_cancel;
3048 : }
3049 :
3050 : /*
3051 : * For whiteouts, we only need to update the source dirent with the
3052 : * inode number of the whiteout inode rather than removing it
3053 : * altogether.
3054 : */
3055 33065337 : if (wip)
3056 2307043 : error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3057 : spaceres);
3058 : else
3059 30758294 : error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3060 : spaceres);
3061 :
3062 33065003 : if (error)
3063 2 : goto out_trans_cancel;
3064 :
3065 33065001 : xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3066 33065147 : xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3067 33065410 : if (new_parent)
3068 28878744 : xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3069 :
3070 33065410 : error = xfs_finish_rename(tp);
3071 33065015 : if (wip)
3072 2306985 : xfs_irele(wip);
3073 : return error;
3074 :
3075 2722 : out_trans_cancel:
3076 2722 : xfs_trans_cancel(tp);
3077 2968 : out_release_wip:
3078 2968 : if (wip)
3079 1105 : xfs_irele(wip);
3080 2968 : if (error == -ENOSPC && nospace_error)
3081 1792 : error = nospace_error;
3082 : return error;
3083 : }
3084 :
3085 : static int
3086 174019019 : xfs_iflush(
3087 : struct xfs_inode *ip,
3088 : struct xfs_buf *bp)
3089 : {
3090 174019019 : struct xfs_inode_log_item *iip = ip->i_itemp;
3091 174019019 : struct xfs_dinode *dip;
3092 174019019 : struct xfs_mount *mp = ip->i_mount;
3093 174019019 : int error;
3094 :
3095 174019019 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3096 348038038 : ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3097 174019019 : ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3098 : ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3099 174019019 : ASSERT(iip->ili_item.li_buf == bp);
3100 :
3101 174019019 : dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3102 :
3103 : /*
3104 : * We don't flush the inode if any of the following checks fail, but we
3105 : * do still update the log item and attach to the backing buffer as if
3106 : * the flush happened. This is a formality to facilitate predictable
3107 : * error handling, as the caller will shut down and fail the buffer.
3108 : */
3109 174019019 : error = -EFSCORRUPTED;
3110 174019019 : if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3111 : mp, XFS_ERRTAG_IFLUSH_1)) {
3112 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3113 : "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
3114 : __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3115 0 : goto flush_out;
3116 : }
3117 174019019 : if (S_ISREG(VFS_I(ip)->i_mode)) {
3118 109340282 : if (XFS_TEST_ERROR(
3119 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3120 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3121 : mp, XFS_ERRTAG_IFLUSH_3)) {
3122 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3123 : "%s: Bad regular inode %llu, ptr "PTR_FMT,
3124 : __func__, ip->i_ino, ip);
3125 0 : goto flush_out;
3126 : }
3127 64678737 : } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3128 40174889 : if (XFS_TEST_ERROR(
3129 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3130 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3131 : ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3132 : mp, XFS_ERRTAG_IFLUSH_4)) {
3133 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3134 : "%s: Bad directory inode %llu, ptr "PTR_FMT,
3135 : __func__, ip->i_ino, ip);
3136 0 : goto flush_out;
3137 : }
3138 : }
3139 348038038 : if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
3140 : ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3141 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3142 : "%s: detected corrupt incore inode %llu, "
3143 : "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
3144 : __func__, ip->i_ino,
3145 : ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
3146 : ip->i_nblocks, ip);
3147 0 : goto flush_out;
3148 : }
3149 174019019 : if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3150 : mp, XFS_ERRTAG_IFLUSH_6)) {
3151 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3152 : "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
3153 : __func__, ip->i_ino, ip->i_forkoff, ip);
3154 0 : goto flush_out;
3155 : }
3156 :
3157 : /*
3158 : * Inode item log recovery for v2 inodes is dependent on the flushiter
3159 : * count for correct sequencing. We bump the flush iteration count so
3160 : * we can detect flushes which postdate a log record during recovery.
3161 : * This is redundant as we now log every change and hence this can't
3162 : * happen, but we still need to do it to ensure backwards compatibility
3163 : * with old kernels that predate logging all inode changes.
3164 : */
3165 174019019 : if (!xfs_has_v3inodes(mp))
3166 1285 : ip->i_flushiter++;
3167 :
3168 : /*
3169 : * If there are inline format data / attr forks attached to this inode,
3170 : * make sure they are not corrupt.
3171 : */
3172 216210399 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3173 42191380 : xfs_ifork_verify_local_data(ip))
3174 0 : goto flush_out;
3175 174019019 : if (xfs_inode_has_attr_fork(ip) &&
3176 70405459 : ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
3177 22179152 : xfs_ifork_verify_local_attr(ip))
3178 0 : goto flush_out;
3179 :
3180 : /*
3181 : * Copy the dirty parts of the inode into the on-disk inode. We always
3182 : * copy out the core of the inode, because if the inode is dirty at all
3183 : * the core must be.
3184 : */
3185 174019019 : xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3186 :
3187 : /* Wrap, we never let the log put out DI_MAX_FLUSH */
3188 174019019 : if (!xfs_has_v3inodes(mp)) {
3189 1285 : if (ip->i_flushiter == DI_MAX_FLUSH)
3190 0 : ip->i_flushiter = 0;
3191 : }
3192 :
3193 174019019 : xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3194 174019019 : if (xfs_inode_has_attr_fork(ip))
3195 48226307 : xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3196 :
3197 : /*
3198 : * We've recorded everything logged in the inode, so we'd like to clear
3199 : * the ili_fields bits so we don't log and flush things unnecessarily.
3200 : * However, we can't stop logging all this information until the data
3201 : * we've copied into the disk buffer is written to disk. If we did we
3202 : * might overwrite the copy of the inode in the log with all the data
3203 : * after re-logging only part of it, and in the face of a crash we
3204 : * wouldn't have all the data we need to recover.
3205 : *
3206 : * What we do is move the bits to the ili_last_fields field. When
3207 : * logging the inode, these bits are moved back to the ili_fields field.
3208 : * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3209 : * we know that the information those bits represent is permanently on
3210 : * disk. As long as the flush completes before the inode is logged
3211 : * again, then both ili_fields and ili_last_fields will be cleared.
3212 : */
3213 : error = 0;
3214 174019019 : flush_out:
3215 174019019 : spin_lock(&iip->ili_lock);
3216 174019019 : iip->ili_last_fields = iip->ili_fields;
3217 174019019 : iip->ili_fields = 0;
3218 174019019 : iip->ili_fsync_fields = 0;
3219 174019019 : spin_unlock(&iip->ili_lock);
3220 :
3221 : /*
3222 : * Store the current LSN of the inode so that we can tell whether the
3223 : * item has moved in the AIL from xfs_buf_inode_iodone().
3224 : */
3225 174019019 : xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3226 : &iip->ili_item.li_lsn);
3227 :
3228 : /* generate the checksum. */
3229 174019019 : xfs_dinode_calc_crc(mp, dip);
3230 174019018 : return error;
3231 : }
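/*
 * [Editor's illustration, not part of the kernel source] The comment
 * above describes a two-stage dirty-mask handoff: a flush parks the
 * dirty bits in ili_last_fields instead of clearing them, I/O completion
 * drops the parked copy, and relogging before completion merges it back.
 * The sketch below models just that protocol; toy_log_item and the
 * toy_* helpers are hypothetical names.
 */
#include <stdio.h>

struct toy_log_item {
	unsigned int	fields;		/* like ili_fields */
	unsigned int	last_fields;	/* like ili_last_fields */
};

/* Flush: park the dirty bits rather than clearing them outright. */
static void toy_flush(struct toy_log_item *iip)
{
	iip->last_fields = iip->fields;
	iip->fields = 0;
}

/* I/O completion: the parked bits are now durable, so drop them. */
static void toy_iodone(struct toy_log_item *iip)
{
	iip->last_fields = 0;
}

/* Relogging before completion: move the parked bits back to fields. */
static void toy_log(struct toy_log_item *iip, unsigned int dirty)
{
	iip->fields |= dirty | iip->last_fields;
	iip->last_fields = 0;
}

int main(void)
{
	struct toy_log_item iip = { .fields = 0x5 };

	toy_flush(&iip);
	toy_log(&iip, 0x2);	/* relog before the flush I/O completes */
	printf("fields=0x%x last=0x%x\n", iip.fields, iip.last_fields);
	toy_iodone(&iip);
	return 0;
}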
3232 :
3233 : /*
3234 : * Non-blocking flush of dirty inode metadata into the backing buffer.
3235 : *
3236 : * The caller must have a reference to the inode and hold the cluster buffer
3237 : * locked. The function will walk across all the inodes on the cluster buffer that
3238 : * it can find and lock without blocking, and flush them to the cluster buffer.
3239 : *
3240 : * On successful flushing of at least one inode, the caller must write out the
3241 : * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3242 : * the caller needs to release the buffer. On failure, the filesystem will be
3243 : * shut down, the buffer will have been unlocked and released, and -EFSCORRUPTED
3244 : * will be returned.
3245 : */
3246 : int
3247 24065002 : xfs_iflush_cluster(
3248 : struct xfs_buf *bp)
3249 : {
3250 24065002 : struct xfs_mount *mp = bp->b_mount;
3251 24065002 : struct xfs_log_item *lip, *n;
3252 24065002 : struct xfs_inode *ip;
3253 24065002 : struct xfs_inode_log_item *iip;
3254 24065002 : int clcount = 0;
3255 24065002 : int error = 0;
3256 :
3257 : /*
3258 : * We must use the safe variant here because on shutdown xfs_iflush_abort()
3259 : * will remove the log item from the list.
3260 : */
3261 206225196 : list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3262 182160194 : iip = (struct xfs_inode_log_item *)lip;
3263 182160194 : ip = iip->ili_inode;
3264 :
3265 : /*
3266 : * Quick and dirty check to avoid locks if possible.
3267 : */
3268 182160194 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3269 122467 : continue;
3270 182037727 : if (xfs_ipincount(ip))
3271 4727658 : continue;
3272 :
3273 : /*
3274 : * The inode is still attached to the buffer, which means it is
3275 : * dirty but reclaim might try to grab it. Check carefully for
3276 : * that, and grab the ilock while still holding the i_flags_lock
3277 : * to guarantee reclaim will not be able to reclaim this inode
3278 : * once we drop the i_flags_lock.
3279 : */
3280 177310069 : spin_lock(&ip->i_flags_lock);
3281 177310069 : ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3282 177310069 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3283 13 : spin_unlock(&ip->i_flags_lock);
3284 13 : continue;
3285 : }
3286 :
3287 : /*
3288 : * ILOCK will pin the inode against reclaim and prevent
3289 : * concurrent transactions modifying the inode while we are
3290 : * flushing the inode. If we get the lock, set the flushing
3291 : * state before we drop the i_flags_lock.
3292 : */
3293 177310056 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3294 1161731 : spin_unlock(&ip->i_flags_lock);
3295 1161731 : continue;
3296 : }
3297 176148325 : __xfs_iflags_set(ip, XFS_IFLUSHING);
3298 176148325 : spin_unlock(&ip->i_flags_lock);
3299 :
3300 : /*
3301 : * Abort flushing this inode if we are shut down because the
3302 : * inode may not currently be in the AIL. This can occur when
3303 : * log I/O failure unpins the inode without inserting into the
3304 : * AIL, leaving a dirty/unpinned inode attached to the buffer
3305 : * that otherwise looks like it should be flushed.
3306 : */
3307 352296650 : if (xlog_is_shutdown(mp->m_log)) {
3308 2129306 : xfs_iunpin_wait(ip);
3309 2129306 : xfs_iflush_abort(ip);
3310 2129306 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3311 2129306 : error = -EIO;
3312 2129306 : continue;
3313 : }
3314 :
3315 : /* don't block waiting on a log force to unpin dirty inodes */
3316 174019019 : if (xfs_ipincount(ip)) {
3317 0 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3318 0 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3319 0 : continue;
3320 : }
3321 :
3322 174019019 : if (!xfs_inode_clean(ip))
3323 174019019 : error = xfs_iflush(ip, bp);
3324 : else
3325 0 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3326 174019018 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3327 174019019 : if (error)
3328 : break;
3329 174019019 : clcount++;
3330 : }
3331 :
3332 24065002 : if (error) {
3333 : /*
3334 : * Shutdown first so we kill the log before we release this
3335 : * buffer. If it is an INODE_ALLOC buffer and pins the tail
3336 : * of the log, failing it before the _log_ is shut down can
3337 : * result in the log tail being moved forward in the journal
3338 : * on disk because log writes can still be taking place. Hence
3339 : * unpinning the tail will allow the ICREATE intent to be
3340 : * removed from the log and recovery will fail with uninitialised
3341 : * inode cluster buffers.
3342 : */
3343 325005 : xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3344 325005 : bp->b_flags |= XBF_ASYNC;
3345 325005 : xfs_buf_ioend_fail(bp);
3346 325005 : return error;
3347 : }
3348 :
3349 23739997 : if (!clcount)
3350 : return -EAGAIN;
3351 :
3352 23593612 : XFS_STATS_INC(mp, xs_icluster_flushcnt);
3353 23593612 : XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3354 23593612 : return 0;
3356 : }
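/*
 * [Editor's illustration, not part of the kernel source] xfs_iflush_cluster()
 * is an opportunistic, non-blocking sweep: it trylocks each inode, skips
 * anything contended or pinned, and reports -EAGAIN when nothing at all
 * was flushed. The sketch below models that pattern with plain pthread
 * trylocks; toy_flush_cluster is a hypothetical name.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define NITEMS	4

static int toy_flush_cluster(pthread_mutex_t *locks, int n)
{
	int i, clcount = 0;

	for (i = 0; i < n; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0)
			continue;	/* contended: skip rather than block */
		/* ... flush item i into the shared buffer here ... */
		pthread_mutex_unlock(&locks[i]);
		clcount++;
	}
	return clcount ? 0 : -EAGAIN;	/* nothing flushed: try again later */
}

int main(void)
{
	pthread_mutex_t locks[NITEMS];
	int i;

	for (i = 0; i < NITEMS; i++)
		pthread_mutex_init(&locks[i], NULL);
	printf("flush returned %d\n", toy_flush_cluster(locks, NITEMS));
	return 0;
}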
3357 :
3358 : /* Release an inode. */
3359 : void
3360 56004156003 : xfs_irele(
3361 : struct xfs_inode *ip)
3362 : {
3363 56004156003 : trace_xfs_irele(ip, _RET_IP_);
3364 56111677807 : iput(VFS_I(ip));
3365 57403116978 : }
3366 :
3367 : /*
3368 : * Ensure all committed transactions touching the inode are written to the log.
3369 : */
3370 : int
3371 529159 : xfs_log_force_inode(
3372 : struct xfs_inode *ip)
3373 : {
3374 529159 : xfs_csn_t seq = 0;
3375 :
3376 529159 : xfs_ilock(ip, XFS_ILOCK_SHARED);
3377 529166 : if (xfs_ipincount(ip))
3378 70466 : seq = ip->i_itemp->ili_commit_seq;
3379 529166 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3380 :
3381 529162 : if (!seq)
3382 : return 0;
3383 70465 : return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3384 : }
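/*
 * [Editor's illustration, not part of the kernel source] The function
 * above samples the commit sequence under the shared ilock, drops the
 * lock, and only then does the expensive log force. The sketch below
 * models that "sample under the lock, act outside it" pattern;
 * toy_inode and toy_log_force_inode are hypothetical names.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode {
	pthread_mutex_t	lock;
	unsigned int	pincount;	/* like xfs_ipincount() */
	uint64_t	commit_seq;	/* like ili_commit_seq */
};

static int toy_log_force_inode(struct toy_inode *ip)
{
	uint64_t seq = 0;

	pthread_mutex_lock(&ip->lock);
	if (ip->pincount)
		seq = ip->commit_seq;	/* only pinned inodes need a force */
	pthread_mutex_unlock(&ip->lock);

	if (!seq)
		return 0;		/* nothing in flight */
	/* ... force the log up to 'seq' here, outside the lock ... */
	printf("forcing log to seq %llu\n", (unsigned long long)seq);
	return 0;
}

int main(void)
{
	struct toy_inode ip = { PTHREAD_MUTEX_INITIALIZER, 1, 7 };

	return toy_log_force_inode(&ip);
}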
3385 :
3386 : /*
3387 : * Grab the exclusive iolock for a data copy from src to dest, making sure to
3388 : * abide by the VFS locking order (lowest pointer value goes first) and to break
3389 : * the layout leases before proceeding. The loop is needed because we cannot call
3390 : * the blocking break_layout() with the iolocks held, and therefore have to
3391 : * back out both locks.
3392 : */
3393 : static int
3394 140147901 : xfs_iolock_two_inodes_and_break_layout(
3395 : struct inode *src,
3396 : struct inode *dest)
3397 : {
3398 140147901 : int error;
3399 :
3400 140147901 : if (src > dest)
3401 67880881 : swap(src, dest);
3402 :
3403 140147901 : retry:
3404 : /* Wait to break both inodes' layouts before we start locking. */
3405 140147901 : error = break_layout(src, true);
3406 140149417 : if (error)
3407 0 : return error;
3408 140149417 : if (src != dest) {
3409 135363416 : error = break_layout(dest, true);
3410 135362735 : if (error)
3411 0 : return error;
3412 : }
3413 :
3414 : /* Lock one inode and make sure nobody got in and leased it. */
3415 140148736 : inode_lock(src);
3416 140147393 : error = break_layout(src, false);
3417 140146634 : if (error) {
3418 0 : inode_unlock(src);
3419 0 : if (error == -EWOULDBLOCK)
3420 0 : goto retry;
3421 0 : return error;
3422 : }
3423 :
3424 140146634 : if (src == dest)
3425 : return 0;
3426 :
3427 : /* Lock the other inode and make sure nobody got in and leased it. */
3428 135360638 : inode_lock_nested(dest, I_MUTEX_NONDIR2);
3429 135362003 : error = break_layout(dest, false);
3430 135363612 : if (error) {
3431 0 : inode_unlock(src);
3432 0 : inode_unlock(dest);
3433 0 : if (error == -EWOULDBLOCK)
3434 0 : goto retry;
3435 0 : return error;
3436 : }
3437 :
3438 : return 0;
3439 : }
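/*
 * [Editor's illustration, not part of the kernel source] The helper above
 * avoids ABBA deadlocks by always locking the lower-addressed inode
 * first, and restarts from the top whenever a lease must be broken while
 * unlocked. The sketch below shows just the ordering rule with pthread
 * mutexes; toy_lock_two is a hypothetical name.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void toy_lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* same object: lock it once */
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a > (uintptr_t)b) {	/* like 'if (src > dest) swap' */
		pthread_mutex_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);		/* lowest address always first */
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	toy_lock_two(&m2, &m1);		/* argument order no longer matters */
	puts("both locks held in a deadlock-safe order");
	return 0;
}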
3440 :
3441 : static int
3442 0 : xfs_mmaplock_two_inodes_and_break_dax_layout(
3443 : struct xfs_inode *ip1,
3444 : struct xfs_inode *ip2)
3445 : {
3446 0 : int error;
3447 0 : bool retry;
3448 0 : struct page *page;
3449 :
3450 0 : if (ip1->i_ino > ip2->i_ino)
3451 0 : swap(ip1, ip2);
3452 :
3453 0 : again:
3454 0 : retry = false;
3455 : /* Lock the first inode */
3456 0 : xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3457 0 : error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
3458 0 : if (error || retry) {
3459 0 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3460 0 : if (error == 0 && retry)
3461 0 : goto again;
3462 0 : return error;
3463 : }
3464 :
3465 0 : if (ip1 == ip2)
3466 : return 0;
3467 :
3468 : /* Nested lock the second inode */
3469 0 : xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
3470 : /*
3471 : * We cannot use xfs_break_dax_layouts() directly here because it may
3472 : * need to unlock and relock XFS_MMAPLOCK_EXCL, which is not suitable
3473 : * for this nested lock case.
3474 : */
3475 0 : page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
3476 0 : if (page && page_ref_count(page) != 1) {
3477 0 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3478 0 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3479 0 : goto again;
3480 : }
3481 :
3482 : return 0;
3483 : }
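/*
 * [Editor's illustration, not part of the kernel source] When the busy
 * DAX page cannot be waited on while the nested MMAPLOCK is held, the
 * function above backs out of both locks and retries from the top. The
 * sketch below models that back-out-and-retry shape; the names and the
 * atomic 'busy' flag are hypothetical stand-ins for the page refcount.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static void toy_lock_two_unless_busy(pthread_mutex_t *l1, pthread_mutex_t *l2,
				     atomic_int *busy)
{
	for (;;) {
		pthread_mutex_lock(l1);
		pthread_mutex_lock(l2);
		if (!atomic_load(busy))
			return;			/* both held, nothing busy */
		pthread_mutex_unlock(l2);	/* cannot wait here: back out */
		pthread_mutex_unlock(l1);
		atomic_store(busy, 0);		/* toy stand-in for waiting unlocked */
	}
}

int main(void)
{
	pthread_mutex_t l1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t l2 = PTHREAD_MUTEX_INITIALIZER;
	atomic_int busy = 1;

	toy_lock_two_unless_busy(&l1, &l2, &busy);
	puts("locked after the busy state cleared");
	return 0;
}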
3484 :
3485 : /*
3486 : * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3487 : * mmap activity.
3488 : */
3489 : int
3490 140144367 : xfs_ilock2_io_mmap(
3491 : struct xfs_inode *ip1,
3492 : struct xfs_inode *ip2)
3493 : {
3494 140144367 : int ret;
3495 :
3496 140144367 : ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3497 140146766 : if (ret)
3498 : return ret;
3499 :
3500 140146766 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3501 0 : ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
3502 0 : if (ret) {
3503 0 : inode_unlock(VFS_I(ip2));
3504 0 : if (ip1 != ip2)
3505 0 : inode_unlock(VFS_I(ip1));
3506 0 : return ret;
3507 : }
3508 : } else
3509 140146766 : filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
3510 : VFS_I(ip2)->i_mapping);
3511 :
3512 : return 0;
3513 : }
3514 :
3515 : /* Unlock both inodes to allow I/O and mmap activity. */
3516 : void
3517 140117052 : xfs_iunlock2_io_mmap(
3518 : struct xfs_inode *ip1,
3519 : struct xfs_inode *ip2)
3520 : {
3521 140117052 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3522 0 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3523 0 : if (ip1 != ip2)
3524 0 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3525 : } else
3526 140117052 : filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
3527 : VFS_I(ip2)->i_mapping);
3528 :
3529 140134977 : inode_unlock(VFS_I(ip2));
3530 140145165 : if (ip1 != ip2)
3531 135359174 : inode_unlock(VFS_I(ip1));
3532 140143750 : }
|