Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include <linux/iversion.h>
7 :
8 : #include "xfs.h"
9 : #include "xfs_fs.h"
10 : #include "xfs_shared.h"
11 : #include "xfs_format.h"
12 : #include "xfs_log_format.h"
13 : #include "xfs_trans_resv.h"
14 : #include "xfs_mount.h"
15 : #include "xfs_defer.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_dir2.h"
18 : #include "xfs_attr.h"
19 : #include "xfs_bit.h"
20 : #include "xfs_trans_space.h"
21 : #include "xfs_trans.h"
22 : #include "xfs_buf_item.h"
23 : #include "xfs_inode_item.h"
24 : #include "xfs_iunlink_item.h"
25 : #include "xfs_ialloc.h"
26 : #include "xfs_bmap.h"
27 : #include "xfs_bmap_util.h"
28 : #include "xfs_errortag.h"
29 : #include "xfs_error.h"
30 : #include "xfs_quota.h"
31 : #include "xfs_filestream.h"
32 : #include "xfs_trace.h"
33 : #include "xfs_icache.h"
34 : #include "xfs_symlink.h"
35 : #include "xfs_trans_priv.h"
36 : #include "xfs_log.h"
37 : #include "xfs_bmap_btree.h"
38 : #include "xfs_reflink.h"
39 : #include "xfs_ag.h"
40 : #include "xfs_log_priv.h"
41 : #include "xfs_health.h"
42 : #include "xfs_pnfs.h"
43 : #include "xfs_parent.h"
44 : #include "xfs_xattr.h"
45 :
46 : struct kmem_cache *xfs_inode_cache;
47 :
48 : /*
49 : * Helper function to extract the extent size hint from an inode.
50 : */
51 : xfs_extlen_t
52 601209899 : xfs_get_extsz_hint(
53 : struct xfs_inode *ip)
54 : {
55 : /*
56 : * No point in aligning allocations if we need to COW to actually
57 : * write to them.
58 : */
59 601209899 : if (xfs_is_always_cow_inode(ip))
60 : return 0;
61 601209899 : if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
62 : return ip->i_extsize;
63 588583763 : if (XFS_IS_REALTIME_INODE(ip))
64 196587166 : return ip->i_mount->m_sb.sb_rextsize;
65 : return 0;
66 : }
67 :
68 : /*
69 : * Helper function to extract CoW extent size hint from inode.
70 : * Between the extent size hint and the CoW extent size hint, we
71 : * return the greater of the two. If the value is zero (automatic),
72 : * use the default size.
73 : */
74 : xfs_extlen_t
75 716973 : xfs_get_cowextsz_hint(
76 : struct xfs_inode *ip)
77 : {
78 716973 : xfs_extlen_t a, b;
79 :
80 716973 : a = 0;
81 716973 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
82 27829 : a = ip->i_cowextsize;
83 716973 : b = xfs_get_extsz_hint(ip);
84 :
85 716973 : a = max(a, b);
86 716973 : if (a == 0)
87 689150 : return XFS_DEFAULT_COWEXTSZ_HINT;
88 : return a;
89 : }
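For example: a file with XFS_DIFLAG_EXTSIZE set and i_extsize = 16 blocks but no CoW hint of its own gets a 16-block CoW extent size hint, while a file with neither hint falls back to XFS_DEFAULT_COWEXTSZ_HINT (32 filesystem blocks in current kernels).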
90 :
91 : /*
92 : * These two are wrapper routines around the xfs_ilock() routine used to
93 : * centralize some grungy code. They are used in places that wish to lock the
94 : * inode solely for reading the extents. The reason these places can't just
95 : * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards
96 : * the reading in of the extents from disk for a file in b-tree format. If the
97 : * inode is in b-tree format, then we need to lock the inode exclusively until
98 : * the extents are read in. Locking it exclusively all the time would limit
99 : * our parallelism unnecessarily, though. What we do instead is check to see
100 : * if the extents have been read in yet, and only lock the inode exclusively
101 : * if they have not.
102 : *
103 : * The functions return a value which should be given to the corresponding
104 : * xfs_iunlock() call.
105 : */
106 : uint
107 722882747 : xfs_ilock_data_map_shared(
108 : struct xfs_inode *ip)
109 : {
110 722882747 : uint lock_mode = XFS_ILOCK_SHARED;
111 :
112 722882747 : if (xfs_need_iread_extents(&ip->i_df))
113 65640 : lock_mode = XFS_ILOCK_EXCL;
114 723024524 : xfs_ilock(ip, lock_mode);
115 723050617 : return lock_mode;
116 : }
117 :
118 : uint
119 1709151885 : xfs_ilock_attr_map_shared(
120 : struct xfs_inode *ip)
121 : {
122 1709151885 : uint lock_mode = XFS_ILOCK_SHARED;
123 :
124 3418895897 : if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
125 0 : lock_mode = XFS_ILOCK_EXCL;
126 1709355685 : xfs_ilock(ip, lock_mode);
127 1709924259 : return lock_mode;
128 : }
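A minimal caller sketch (the helper name xfs_count_data_extents is hypothetical): whichever mode the wrapper chose must be handed back to xfs_iunlock(), and an exclusive grab is what permits xfs_iread_extents() to pull the extent list in.

static int
xfs_count_data_extents(
	struct xfs_inode	*ip,
	xfs_extnum_t		*nextents)
{
	uint			lock_mode;
	int			error;

	lock_mode = xfs_ilock_data_map_shared(ip);
	/* EXCL was taken above iff the extents still need reading in */
	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
	if (!error)
		*nextents = xfs_iext_count(&ip->i_df);
	xfs_iunlock(ip, lock_mode);
	return error;
}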
129 :
130 : /*
131 : * You can't set both SHARED and EXCL for the same lock,
132 : * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
133 : * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
134 : * to set in lock_flags.
135 : */
136 : static inline void
137 >14113*10^7 : xfs_lock_flags_assert(
138 : uint lock_flags)
139 : {
140 >14113*10^7 : ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
141 : (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
142 >14113*10^7 : ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
143 : (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
144 >14113*10^7 : ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
145 : (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
146 >14113*10^7 : ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
147 >14113*10^7 : ASSERT(lock_flags != 0);
148 >14113*10^7 : }
149 :
150 : /*
151 : * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
152 : * multi-reader locks: invalidate_lock and the i_lock. This routine allows
153 : * various combinations of the locks to be obtained.
154 : *
155 : * The 3 locks should always be ordered so that the IO lock is obtained first,
156 : * the mmap lock second and the ilock last in order to prevent deadlock.
157 : *
158 : * Basic locking order:
159 : *
160 : * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
161 : *
162 : * mmap_lock locking order:
163 : *
164 : * i_rwsem -> page lock -> mmap_lock
165 : * mmap_lock -> invalidate_lock -> page_lock
166 : *
167 : * The difference in mmap_lock locking order means that we cannot hold the
168 : * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
169 : * can fault in pages during copy in/out (for buffered IO) or require the
170 : * mmap_lock in get_user_pages() to map the user pages into the kernel address
171 : * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
172 : * fault because page faults already hold the mmap_lock.
173 : *
174 : * Hence to serialise fully against both syscall and mmap based IO, we need to
175 : * take both the i_rwsem and the invalidate_lock. These locks should *only* be
176 : * both taken in places where we need to invalidate the page cache in a race
177 : * free manner (e.g. truncate, hole punch and other extent manipulation
178 : * functions).
179 : */
180 : void
181 69285871119 : xfs_ilock(
182 : xfs_inode_t *ip,
183 : uint lock_flags)
184 : {
185 69285871119 : trace_xfs_ilock(ip, lock_flags, _RET_IP_);
186 :
187 69627389619 : xfs_lock_flags_assert(lock_flags);
188 :
189 69675008520 : if (lock_flags & XFS_IOLOCK_EXCL) {
190 714649375 : down_write_nested(&VFS_I(ip)->i_rwsem,
191 : XFS_IOLOCK_DEP(lock_flags));
192 68960359145 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
193 867481296 : down_read_nested(&VFS_I(ip)->i_rwsem,
194 : XFS_IOLOCK_DEP(lock_flags));
195 : }
196 :
197 69675090088 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
198 69812816 : down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
199 : XFS_MMAPLOCK_DEP(lock_flags));
200 69605277272 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
201 27297943 : down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
202 : XFS_MMAPLOCK_DEP(lock_flags));
203 : }
204 :
205 69675089164 : if (lock_flags & XFS_ILOCK_EXCL)
206 2289400324 : mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
207 67385688840 : else if (lock_flags & XFS_ILOCK_SHARED)
208 65817591094 : mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
209 69657907239 : }
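A sketch of the pattern described above for race-free page cache invalidation (e.g. truncate or hole punch), assuming a referenced inode ip:

	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	/*
	 * Syscall-based IO is blocked by i_rwsem and page-fault-based IO
	 * by the invalidate_lock, so the page cache can be invalidated
	 * and extents manipulated without racing against new IO.
	 */

	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);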
210 :
211 : /*
212 : * This is just like xfs_ilock(), except that the caller
213 : * is guaranteed not to sleep. It returns 1 if it gets
214 : * the requested locks and 0 otherwise. If the IO lock is
215 : * obtained but the inode lock cannot be, then the IO lock
216 : * is dropped before returning.
217 : *
218 : * ip -- the inode being locked
219 : * lock_flags -- this parameter indicates which of the inode's locks
220 : * are to be locked. See the comment for xfs_ilock() for a list
221 : * of valid values.
222 : */
223 : int
224 2735973524 : xfs_ilock_nowait(
225 : xfs_inode_t *ip,
226 : uint lock_flags)
227 : {
228 2735973524 : trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
229 :
230 2736265119 : xfs_lock_flags_assert(lock_flags);
231 :
232 2739372633 : if (lock_flags & XFS_IOLOCK_EXCL) {
233 472693272 : if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
234 717874 : goto out;
235 2266679361 : } else if (lock_flags & XFS_IOLOCK_SHARED) {
236 217082275 : if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
237 26815047 : goto out;
238 : }
239 :
240 2711834477 : if (lock_flags & XFS_MMAPLOCK_EXCL) {
241 5014 : if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
242 0 : goto out_undo_iolock;
243 2711829463 : } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
244 0 : if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
245 0 : goto out_undo_iolock;
246 : }
247 :
248 2711834477 : if (lock_flags & XFS_ILOCK_EXCL) {
249 1251435680 : if (!mrtryupdate(&ip->i_lock))
250 27152 : goto out_undo_mmaplock;
251 1460398797 : } else if (lock_flags & XFS_ILOCK_SHARED) {
252 798201444 : if (!mrtryaccess(&ip->i_lock))
253 1497477 : goto out_undo_mmaplock;
254 : }
255 : return 1;
256 :
257 1524629 : out_undo_mmaplock:
258 1524629 : if (lock_flags & XFS_MMAPLOCK_EXCL)
259 0 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
260 1524629 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
261 0 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
262 1524629 : out_undo_iolock:
263 1524629 : if (lock_flags & XFS_IOLOCK_EXCL)
264 0 : up_write(&VFS_I(ip)->i_rwsem);
265 1524629 : else if (lock_flags & XFS_IOLOCK_SHARED)
266 0 : up_read(&VFS_I(ip)->i_rwsem);
267 1524629 : out:
268 : return 0;
269 : }
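A typical nowait-with-fallback shape, e.g. in a scanning path that would rather move on than block (the nonblocking flag is a hypothetical parameter):

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
		if (nonblocking)
			return -EAGAIN;	/* revisit this inode later */
		xfs_ilock(ip, XFS_ILOCK_EXCL);	/* blocking fallback */
	}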
270 :
271 : /*
272 : * xfs_iunlock() is used to drop the inode locks acquired with
273 : * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
274 : * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
275 : * that we know which locks to drop.
276 : *
277 : * ip -- the inode being unlocked
278 : * lock_flags -- this parameter indicates which of the inode's locks
279 : * are to be unlocked. See the comment for xfs_ilock() for a list
280 : * of valid values for this parameter.
281 : *
282 : */
283 : void
284 71331485121 : xfs_iunlock(
285 : xfs_inode_t *ip,
286 : uint lock_flags)
287 : {
288 71331485121 : xfs_lock_flags_assert(lock_flags);
289 :
290 71502304773 : if (lock_flags & XFS_IOLOCK_EXCL)
291 1186557988 : up_write(&VFS_I(ip)->i_rwsem);
292 70315746785 : else if (lock_flags & XFS_IOLOCK_SHARED)
293 1057825843 : up_read(&VFS_I(ip)->i_rwsem);
294 :
295 71502265384 : if (lock_flags & XFS_MMAPLOCK_EXCL)
296 69814379 : up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
297 71432451005 : else if (lock_flags & XFS_MMAPLOCK_SHARED)
298 27297971 : up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
299 :
300 71502268443 : if (lock_flags & XFS_ILOCK_EXCL)
301 3539596253 : mrunlock_excl(&ip->i_lock);
302 67962672190 : else if (lock_flags & XFS_ILOCK_SHARED)
303 66520627731 : mrunlock_shared(&ip->i_lock);
304 :
305 71621378356 : trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
306 71570483100 : }
307 :
308 : /*
309 : * Give up write locks. The i/o lock cannot be held nested
310 : * if it is being demoted.
311 : */
312 : void
313 134504 : xfs_ilock_demote(
314 : xfs_inode_t *ip,
315 : uint lock_flags)
316 : {
317 134504 : ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
318 134504 : ASSERT((lock_flags &
319 : ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
320 :
321 134504 : if (lock_flags & XFS_ILOCK_EXCL)
322 0 : mrdemote(&ip->i_lock);
323 134504 : if (lock_flags & XFS_MMAPLOCK_EXCL)
324 0 : downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
325 134504 : if (lock_flags & XFS_IOLOCK_EXCL)
326 134503 : downgrade_write(&VFS_I(ip)->i_rwsem);
327 :
328 134498 : trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
329 134497 : }
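A hypothetical demotion sequence: do the exclusive-only setup work, then downgrade so concurrent shared lockers can proceed; note that the eventual unlock must pass the shared flag, since that is what is now held.

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* ... exclusive-only checks and setup ... */
	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
	/* ... long-running work under the shared iolock ... */
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);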
330 :
331 : #if defined(DEBUG) || defined(XFS_WARN)
332 : static inline bool
333 : __xfs_rwsem_islocked(
334 : struct rw_semaphore *rwsem,
335 : bool shared)
336 : {
337 1183951429 : if (!debug_locks)
338 0 : return rwsem_is_locked(rwsem);
339 :
340 : if (!shared)
341 : return lockdep_is_held_type(rwsem, 0);
342 :
343 : /*
344 : * We are checking that the lock is held at least in shared
345 : * mode but don't care that it might be held exclusively
346 : * (i.e. shared | excl). Hence we check if the lock is held
347 : * in any mode rather than an explicit shared mode.
348 : */
349 : return lockdep_is_held_type(rwsem, -1);
350 : }
351 :
352 : bool
353 17200778744 : xfs_isilocked(
354 : struct xfs_inode *ip,
355 : uint lock_flags)
356 : {
357 17200778744 : if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
358 16138832378 : if (!(lock_flags & XFS_ILOCK_SHARED))
359 8708795175 : return !!ip->i_lock.mr_writer;
360 7430037203 : return rwsem_is_locked(&ip->i_lock.mr_lock);
361 : }
362 :
363 1061946366 : if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
364 6376731 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
365 : (lock_flags & XFS_MMAPLOCK_SHARED));
366 : }
367 :
368 1055569635 : if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
369 1063804713 : return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
370 : (lock_flags & XFS_IOLOCK_SHARED));
371 : }
372 :
373 0 : ASSERT(0);
374 0 : return false;
375 : }
376 : #endif
377 :
378 : /*
379 : * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
380 : * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
381 : * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
382 : * errors and warnings.
383 : */
384 : #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
385 : static bool
386 : xfs_lockdep_subclass_ok(
387 : int subclass)
388 : {
389 : return subclass < MAX_LOCKDEP_SUBCLASSES;
390 : }
391 : #else
392 : #define xfs_lockdep_subclass_ok(subclass) (true)
393 : #endif
394 :
395 : /*
396 : * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
397 : * value. This can be called for any type of inode lock combination, including
398 : * parent locking. Care must be taken to ensure we don't overrun the subclass
399 : * storage fields in the class mask we build.
400 : */
401 : static inline uint
402 382677252 : xfs_lock_inumorder(
403 : uint lock_mode,
404 : uint subclass)
405 : {
406 382677252 : uint class = 0;
407 :
408 382677252 : ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
409 : XFS_ILOCK_RTSUM)));
410 382677252 : ASSERT(xfs_lockdep_subclass_ok(subclass));
411 :
412 382677252 : if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
413 0 : ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
414 0 : class += subclass << XFS_IOLOCK_SHIFT;
415 : }
416 :
417 382677252 : if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
418 0 : ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
419 0 : class += subclass << XFS_MMAPLOCK_SHIFT;
420 : }
421 :
422 382677252 : if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
423 382683348 : ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
424 382683348 : class += subclass << XFS_ILOCK_SHIFT;
425 : }
426 :
427 382677252 : return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
428 : }
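For illustration (hypothetical fragment, mirroring the loop in xfs_lock_inodes() below): each inode of a sorted set is locked with its own lockdep subclass, which this routine encodes into the flags at XFS_ILOCK_SHIFT:

	for (i = 0; i < inodes; i++)
		xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));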
429 :
430 : /*
431 : * The following routine will lock n inodes in exclusive mode. We assume the
432 : * caller calls us with the inodes in i_ino order.
433 : *
434 : * We need to detect deadlock where an inode that we lock is in the AIL and we
435 : * start waiting for another inode that is locked by a thread in a long running
436 : * transaction (such as truncate). This can result in deadlock since the long
437 : * running trans might need to wait for the inode we just locked in order to
438 : * push the tail and free space in the log.
439 : *
440 : * xfs_lock_inodes() can only be used to lock one type of lock at a time -
441 : * the iolock, the mmaplock or the ilock - never a combination. If we
442 : * lock more than one type at a time, lockdep will report false positives saying we
443 : * have violated locking orders.
444 : */
445 : void
446 29481457 : xfs_lock_inodes(
447 : struct xfs_inode **ips,
448 : int inodes,
449 : uint lock_mode)
450 : {
451 29481457 : int attempts = 0;
452 29481457 : uint i;
453 29481457 : int j;
454 29481457 : bool try_lock;
455 29481457 : struct xfs_log_item *lp;
456 :
457 : /*
458 : * Currently supports between 2 and 5 inodes with exclusive locking. We
459 : * support an arbitrary depth of locking here, but absolute limits on
460 : * inodes depend on the type of locking and the limits placed by
461 : * lockdep annotations in xfs_lock_inumorder. These are all checked by
462 : * the asserts.
463 : */
464 29481457 : ASSERT(ips && inodes >= 2 && inodes <= 5);
465 29481457 : ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
466 : XFS_ILOCK_EXCL));
467 29481457 : ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
468 : XFS_ILOCK_SHARED)));
469 29481457 : ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
470 : inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
471 29481457 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
472 : inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
473 :
474 29481457 : if (lock_mode & XFS_IOLOCK_EXCL) {
475 0 : ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
476 29481457 : } else if (lock_mode & XFS_MMAPLOCK_EXCL)
477 0 : ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
478 :
479 29481457 : again:
480 29495155 : try_lock = false;
481 29495155 : i = 0;
482 128098355 : for (; i < inodes; i++) {
483 98616906 : ASSERT(ips[i]);
484 :
485 98616906 : if (i && (ips[i] == ips[i - 1])) /* Already locked */
486 1174954 : continue;
487 :
488 : /*
489 : * If try_lock is not set yet, make sure all locked inodes are
490 : * not in the AIL. If any are, set try_lock to be used later.
491 : */
492 97441952 : if (!try_lock) {
493 141875133 : for (j = (i - 1); j >= 0 && !try_lock; j--) {
494 65487296 : lp = &ips[j]->i_itemp->ili_item;
495 125093066 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
496 18934680 : try_lock = true;
497 : }
498 : }
499 :
500 : /*
501 : * If any of the previous locks we have locked is in the AIL,
502 : * we must TRY to get the second and subsequent locks. If
503 : * we can't get any, we must release all we have
504 : * and try again.
505 : */
506 97441952 : if (!try_lock) {
507 57453175 : xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
508 57453177 : continue;
509 : }
510 :
511 : /* try_lock means we have an inode locked that is in the AIL. */
512 39988777 : ASSERT(i != 0);
513 39988777 : if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
514 39975069 : continue;
515 :
516 : /*
517 : * Unlock all previous guys and try again. xfs_iunlock will try
518 : * to push the tail if the inode is in the AIL.
519 : */
520 13698 : attempts++;
521 40952 : for (j = i - 1; j >= 0; j--) {
522 : /*
523 : * Check to see if we've already unlocked this one: if
524 : * it is not the first one going back and the inode ptr
525 : * is the same as the previous one, it was only locked once.
526 : */
527 27254 : if (j != (i - 1) && ips[j] == ips[j + 1])
528 8333 : continue;
529 :
530 18921 : xfs_iunlock(ips[j], lock_mode);
531 : }
532 :
533 13698 : if ((attempts % 5) == 0) {
534 2643 : delay(1); /* Don't just spin the CPU */
535 : }
536 13698 : goto again;
537 : }
538 29481449 : }
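Sketch of a caller, assuming two already-referenced inodes ip1 and ip2 (hypothetical names); the routine requires i_ino order, so sort the array first, much as xfs_rename() does via xfs_sort_for_rename():

	struct xfs_inode	*ips[2] = { ip1, ip2 };

	if (ips[0]->i_ino > ips[1]->i_ino)
		swap(ips[0], ips[1]);
	xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);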
539 :
540 : /*
541 : * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
542 : * mmaplock must be double-locked separately since we use i_rwsem and
543 : * invalidate_lock for that. We now support taking one lock EXCL and the
544 : * other SHARED.
545 : */
546 : void
547 142620091 : xfs_lock_two_inodes(
548 : struct xfs_inode *ip0,
549 : uint ip0_mode,
550 : struct xfs_inode *ip1,
551 : uint ip1_mode)
552 : {
553 142620091 : int attempts = 0;
554 142620091 : struct xfs_log_item *lp;
555 :
556 285241346 : ASSERT(hweight32(ip0_mode) == 1);
557 285241396 : ASSERT(hweight32(ip1_mode) == 1);
558 142620811 : ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
559 142620811 : ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
560 142620811 : ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
561 142620811 : ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
562 142620811 : ASSERT(ip0->i_ino != ip1->i_ino);
563 :
564 142620811 : if (ip0->i_ino > ip1->i_ino) {
565 7918810 : swap(ip0, ip1);
566 7918810 : swap(ip0_mode, ip1_mode);
567 : }
568 :
569 142620811 : again:
570 142634263 : xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
571 :
572 : /*
573 : * If the first lock we have locked is in the AIL, we must TRY to get
574 : * the second lock. If we can't get it, we must release the first one
575 : * and try again.
576 : */
577 142614619 : lp = &ip0->i_itemp->ili_item;
578 142614619 : if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
579 107480504 : if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
580 13451 : xfs_iunlock(ip0, ip0_mode);
581 13451 : if ((++attempts % 5) == 0)
582 2668 : delay(1); /* Don't just spin the CPU */
583 13452 : goto again;
584 : }
585 : } else {
586 35134115 : xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
587 : }
588 142620863 : }
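A hypothetical mixed-mode use, e.g. reading state from sip while modifying tip; no caller-side sorting is needed because the routine orders by i_ino internally:

	xfs_lock_two_inodes(sip, XFS_ILOCK_SHARED, tip, XFS_ILOCK_EXCL);
	/* ... copy state from sip into tip ... */
	xfs_iunlock(tip, XFS_ILOCK_EXCL);
	xfs_iunlock(sip, XFS_ILOCK_SHARED);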
589 :
590 : uint
591 61214964134 : xfs_ip2xflags(
592 : struct xfs_inode *ip)
593 : {
594 61214964134 : uint flags = 0;
595 :
596 61214964134 : if (ip->i_diflags & XFS_DIFLAG_ANY) {
597 17721127296 : if (ip->i_diflags & XFS_DIFLAG_REALTIME)
598 7361531583 : flags |= FS_XFLAG_REALTIME;
599 17721127296 : if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
600 9555813060 : flags |= FS_XFLAG_PREALLOC;
601 17721127296 : if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
602 198 : flags |= FS_XFLAG_IMMUTABLE;
603 17721127296 : if (ip->i_diflags & XFS_DIFLAG_APPEND)
604 164 : flags |= FS_XFLAG_APPEND;
605 17721127296 : if (ip->i_diflags & XFS_DIFLAG_SYNC)
606 60 : flags |= FS_XFLAG_SYNC;
607 17721127296 : if (ip->i_diflags & XFS_DIFLAG_NOATIME)
608 42 : flags |= FS_XFLAG_NOATIME;
609 17721127296 : if (ip->i_diflags & XFS_DIFLAG_NODUMP)
610 46 : flags |= FS_XFLAG_NODUMP;
611 17721127296 : if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
612 3868236930 : flags |= FS_XFLAG_RTINHERIT;
613 17721127296 : if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
614 1220 : flags |= FS_XFLAG_PROJINHERIT;
615 17721127296 : if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
616 8 : flags |= FS_XFLAG_NOSYMLINKS;
617 17721127296 : if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
618 5282 : flags |= FS_XFLAG_EXTSIZE;
619 17721127296 : if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
620 5415 : flags |= FS_XFLAG_EXTSZINHERIT;
621 17721127296 : if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
622 0 : flags |= FS_XFLAG_NODEFRAG;
623 17721127296 : if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
624 14565 : flags |= FS_XFLAG_FILESTREAM;
625 : }
626 :
627 61214964134 : if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
628 61357689838 : if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
629 86 : flags |= FS_XFLAG_DAX;
630 61357689838 : if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
631 14125 : flags |= FS_XFLAG_COWEXTSIZE;
632 : }
633 :
634 61214964134 : if (xfs_inode_has_attr_fork(ip))
635 61377712771 : flags |= FS_XFLAG_HASATTR;
636 61214964134 : return flags;
637 : }
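The result feeds the VFS fileattr interface; a minimal sketch of that use, mirroring what the FS_IOC_FSGETXATTR path does:

	struct fileattr	fa = { };

	fileattr_fill_xflags(&fa, xfs_ip2xflags(ip));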
638 :
639 : /*
640 : * Looks up an inode from "name". If ci_name is not NULL, then a CI match
641 : * is allowed, otherwise it has to be an exact match. If a CI match is found,
642 : * ci_name->name will point to the actual name (caller must free) or
643 : * will be set to NULL if an exact match is found.
644 : */
645 : int
646 192947846 : xfs_lookup(
647 : struct xfs_inode *dp,
648 : const struct xfs_name *name,
649 : struct xfs_inode **ipp,
650 : struct xfs_name *ci_name)
651 : {
652 192947846 : xfs_ino_t inum;
653 192947846 : int error;
654 :
655 192947846 : trace_xfs_lookup(dp, name);
656 :
657 385880696 : if (xfs_is_shutdown(dp->i_mount))
658 : return -EIO;
659 :
660 192878760 : error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
661 192895722 : if (error)
662 73164523 : goto out_unlock;
663 :
664 119731199 : error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
665 119732455 : if (error)
666 2597 : goto out_free_name;
667 :
668 : return 0;
669 :
670 : out_free_name:
671 2597 : if (ci_name)
672 0 : kmem_free(ci_name->name);
673 2597 : out_unlock:
674 73167120 : *ipp = NULL;
675 73167120 : return error;
676 : }
677 :
678 : /* Propagate di_flags from a parent inode to a child inode. */
679 : static void
680 12326799 : xfs_inode_inherit_flags(
681 : struct xfs_inode *ip,
682 : const struct xfs_inode *pip)
683 : {
684 12326799 : unsigned int di_flags = 0;
685 12326799 : xfs_failaddr_t failaddr;
686 12326799 : umode_t mode = VFS_I(ip)->i_mode;
687 :
688 12326799 : if (S_ISDIR(mode)) {
689 2303894 : if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
690 2303839 : di_flags |= XFS_DIFLAG_RTINHERIT;
691 2303894 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
692 0 : di_flags |= XFS_DIFLAG_EXTSZINHERIT;
693 0 : ip->i_extsize = pip->i_extsize;
694 : }
695 2303894 : if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
696 2 : di_flags |= XFS_DIFLAG_PROJINHERIT;
697 10022905 : } else if (S_ISREG(mode)) {
698 10022760 : if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
699 10017522 : xfs_has_realtime(ip->i_mount))
700 10017532 : di_flags |= XFS_DIFLAG_REALTIME;
701 10022760 : if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
702 430 : di_flags |= XFS_DIFLAG_EXTSIZE;
703 430 : ip->i_extsize = pip->i_extsize;
704 : }
705 : }
706 12326799 : if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
707 0 : xfs_inherit_noatime)
708 0 : di_flags |= XFS_DIFLAG_NOATIME;
709 12326799 : if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
710 0 : xfs_inherit_nodump)
711 0 : di_flags |= XFS_DIFLAG_NODUMP;
712 12326799 : if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
713 0 : xfs_inherit_sync)
714 0 : di_flags |= XFS_DIFLAG_SYNC;
715 12326799 : if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
716 2 : xfs_inherit_nosymlinks)
717 0 : di_flags |= XFS_DIFLAG_NOSYMLINKS;
718 12326799 : if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
719 0 : xfs_inherit_nodefrag)
720 0 : di_flags |= XFS_DIFLAG_NODEFRAG;
721 12326799 : if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
722 4757 : di_flags |= XFS_DIFLAG_FILESTREAM;
723 :
724 12326799 : ip->i_diflags |= di_flags;
725 :
726 : /*
727 : * Inode verifiers on older kernels only check that the extent size
728 : * hint is an integer multiple of the rt extent size on realtime files.
729 : * They did not check the hint alignment on a directory with both
730 : * rtinherit and extszinherit flags set. If the misaligned hint is
731 : * propagated from a directory into a new realtime file, new file
732 : * allocations will fail due to math errors in the rt allocator and/or
733 : * trip the verifiers. Validate the hint settings in the new file so
734 : * that we don't let broken hints propagate.
735 : */
736 12326799 : failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
737 : VFS_I(ip)->i_mode, ip->i_diflags);
738 12326212 : if (failaddr) {
739 0 : ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
740 : XFS_DIFLAG_EXTSZINHERIT);
741 0 : ip->i_extsize = 0;
742 : }
743 12326212 : }
744 :
745 : /* Propagate di_flags2 from a parent inode to a child inode. */
746 : static void
747 29773430 : xfs_inode_inherit_flags2(
748 : struct xfs_inode *ip,
749 : const struct xfs_inode *pip)
750 : {
751 29773430 : xfs_failaddr_t failaddr;
752 :
753 29773430 : if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
754 360 : ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
755 360 : ip->i_cowextsize = pip->i_cowextsize;
756 : }
757 29773430 : if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
758 30 : ip->i_diflags2 |= XFS_DIFLAG2_DAX;
759 :
760 : /* Don't let invalid cowextsize hints propagate. */
761 29773430 : failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
762 : VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
763 29773546 : if (failaddr) {
764 0 : ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
765 0 : ip->i_cowextsize = 0;
766 : }
767 29773546 : }
768 :
769 : /*
770 : * Initialise a newly allocated inode and return the in-core inode to the
771 : * caller locked exclusively.
772 : *
773 : * Caller is responsible for unlocking the inode manually upon return
774 : */
775 : int
776 69439768 : xfs_init_new_inode(
777 : struct mnt_idmap *idmap,
778 : struct xfs_trans *tp,
779 : struct xfs_inode *pip,
780 : xfs_ino_t ino,
781 : umode_t mode,
782 : xfs_nlink_t nlink,
783 : dev_t rdev,
784 : prid_t prid,
785 : bool init_xattrs,
786 : struct xfs_inode **ipp)
787 : {
788 69439768 : struct inode *dir = pip ? VFS_I(pip) : NULL;
789 69439768 : struct xfs_mount *mp = tp->t_mountp;
790 69439768 : struct xfs_inode *ip;
791 69439768 : unsigned int flags;
792 69439768 : int error;
793 69439768 : struct timespec64 tv;
794 69439768 : struct inode *inode;
795 :
796 : /*
797 : * Protect against obviously corrupt allocation btree records. Later
798 : * xfs_iget checks will catch re-allocation of other active in-memory
799 : * and on-disk inodes. If we don't catch reallocating the parent inode
800 : * here we will deadlock in xfs_iget() so we have to do these checks
801 : * first.
802 : */
803 69439768 : if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
804 0 : xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
805 0 : xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
806 : XFS_SICK_AG_INOBT);
807 0 : return -EFSCORRUPTED;
808 : }
809 :
810 : /*
811 : * Get the in-core inode with the lock held exclusively to prevent
812 : * others from looking at until we're done.
813 : */
814 69438460 : error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
815 69436458 : if (error)
816 : return error;
817 :
818 69436456 : ASSERT(ip != NULL);
819 69436456 : inode = VFS_I(ip);
820 69436456 : set_nlink(inode, nlink);
821 69434260 : inode->i_rdev = rdev;
822 69434260 : ip->i_projid = prid;
823 :
824 69434260 : if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
825 0 : inode_fsuid_set(inode, idmap);
826 0 : inode->i_gid = dir->i_gid;
827 0 : inode->i_mode = mode;
828 : } else {
829 69434260 : inode_init_owner(idmap, inode, dir, mode);
830 : }
831 :
832 : /*
833 : * If the group ID of the new file does not match the effective group
834 : * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
835 : * (and only if the irix_sgid_inherit compatibility variable is set).
836 : */
837 69433481 : if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
838 0 : !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
839 0 : inode->i_mode &= ~S_ISGID;
840 :
841 69433481 : ip->i_disk_size = 0;
842 69433481 : ip->i_df.if_nextents = 0;
843 69433481 : ASSERT(ip->i_nblocks == 0);
844 :
845 69433481 : tv = current_time(inode);
846 69437412 : inode->i_mtime = tv;
847 69437412 : inode->i_atime = tv;
848 69437412 : inode->i_ctime = tv;
849 :
850 69437412 : ip->i_extsize = 0;
851 69437412 : ip->i_diflags = 0;
852 :
853 69437412 : if (xfs_has_v3inodes(mp)) {
854 69436887 : inode_set_iversion(inode, 1);
855 69436887 : ip->i_cowextsize = 0;
856 69436887 : ip->i_crtime = tv;
857 : }
858 :
859 69437412 : flags = XFS_ILOG_CORE;
860 69437412 : switch (mode & S_IFMT) {
861 7722653 : case S_IFIFO:
862 : case S_IFCHR:
863 : case S_IFBLK:
864 : case S_IFSOCK:
865 7722653 : ip->i_df.if_format = XFS_DINODE_FMT_DEV;
866 7722653 : flags |= XFS_ILOG_DEV;
867 7722653 : break;
868 29783272 : case S_IFREG:
869 : case S_IFDIR:
870 29783272 : if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
871 12326224 : xfs_inode_inherit_flags(ip, pip);
872 29783010 : if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
873 29773975 : xfs_inode_inherit_flags2(ip, pip);
874 61713901 : fallthrough;
875 : case S_IFLNK:
876 61713901 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
877 61713901 : ip->i_df.if_bytes = 0;
878 61713901 : ip->i_df.if_u1.if_root = NULL;
879 61713901 : break;
880 0 : default:
881 0 : ASSERT(0);
882 : }
883 :
884 : /*
885 : * If we need to create attributes immediately after allocating the
886 : * inode, initialise an empty attribute fork right now. We use the
887 : * default fork offset for attributes here as we don't know exactly what
888 : * size or how many attributes we might be adding. We can do this
889 : * safely here because we know the data fork is completely empty and
890 : * this saves us from needing to run a separate transaction to set the
891 : * fork offset in the immediate future.
892 : */
893 69436554 : if (init_xattrs && xfs_has_attr(mp)) {
894 61297401 : ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
895 61295622 : xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
896 : }
897 :
898 : /*
899 : * Log the new values stuffed into the inode.
900 : */
901 69431457 : xfs_trans_ijoin(tp, ip, 0);
902 69431328 : xfs_trans_log_inode(tp, ip, flags);
903 :
904 : /* now that we have an i_mode we can setup the inode structure */
905 69440085 : xfs_setup_inode(ip);
906 :
907 69438659 : *ipp = ip;
908 69438659 : return 0;
909 : }
910 :
911 : /*
912 : * Decrement the link count on an inode & log the change. If this causes the
913 : * link count to go to zero, move the inode to the AGI unlinked list so that it can
914 : * be freed when the last active reference goes away via xfs_inactive().
915 : */
916 : int
917 49908073 : xfs_droplink(
918 : struct xfs_trans *tp,
919 : struct xfs_inode *ip)
920 : {
921 49908073 : struct inode *inode = VFS_I(ip);
922 :
923 49908073 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
924 :
925 49908749 : if (inode->i_nlink != XFS_NLINK_PINNED)
926 49909429 : drop_nlink(VFS_I(ip));
927 :
928 49908275 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
929 :
930 49909484 : if (VFS_I(ip)->i_nlink)
931 : return 0;
932 :
933 38574800 : return xfs_iunlink(tp, ip);
934 : }
935 :
936 : /*
937 : * Increment the link count on an inode & log the change.
938 : */
939 : void
940 21552481 : xfs_bumplink(
941 : struct xfs_trans *tp,
942 : struct xfs_inode *ip)
943 : {
944 21552481 : struct inode *inode = VFS_I(ip);
945 :
946 21552481 : xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
947 :
948 21552412 : if (inode->i_nlink != XFS_NLINK_PINNED)
949 21552438 : inc_nlink(VFS_I(ip));
950 :
951 21552535 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
952 21552555 : }
953 :
954 : #ifdef CONFIG_XFS_LIVE_HOOKS
955 : /*
956 : * Use a static key here to reduce the overhead of directory live update hooks.
957 : * If the compiler supports jump labels, the static branch will be replaced by
958 : * a nop sled when there are no hook users. Online fsck is currently the only
959 : * caller, so this is a reasonable tradeoff.
960 : *
961 : * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
962 : * parts of the kernel allocate memory with that lock held, which means that
963 : * XFS callers cannot hold any locks that might be used by memory reclaim or
964 : * writeback when calling the static_branch_{inc,dec} functions.
965 : */
966 : DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
967 :
968 : void
969 20588196 : xfs_dir_hook_disable(void)
970 : {
971 20588196 : xfs_hooks_switch_off(&xfs_dir_hooks_switch);
972 20591794 : }
973 :
974 : void
975 20591332 : xfs_dir_hook_enable(void)
976 : {
977 20591332 : xfs_hooks_switch_on(&xfs_dir_hooks_switch);
978 20587768 : }
979 :
980 : /* Call hooks for a directory update relating to a child dirent update. */
981 : inline void
982 182550823 : xfs_dir_update_hook(
983 : struct xfs_inode *dp,
984 : struct xfs_inode *ip,
985 : int delta,
986 : const struct xfs_name *name)
987 : {
988 239931920 : if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
989 57380422 : struct xfs_dir_update_params p = {
990 : .dp = dp,
991 : .ip = ip,
992 : .delta = delta,
993 : .name = name,
994 : };
995 57380422 : struct xfs_mount *mp = ip->i_mount;
996 :
997 57380422 : xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
998 : }
999 182551990 : }
1000 :
1001 : /* Call the specified function during a directory update. */
1002 : int
1003 13800694 : xfs_dir_hook_add(
1004 : struct xfs_mount *mp,
1005 : struct xfs_dir_hook *hook)
1006 : {
1007 13800694 : return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
1008 : }
1009 :
1010 : /* Stop calling the specified function during a directory update. */
1011 : void
1012 13818464 : xfs_dir_hook_del(
1013 : struct xfs_mount *mp,
1014 : struct xfs_dir_hook *hook)
1015 : {
1016 13818464 : xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
1017 13820342 : }
1018 : #endif /* CONFIG_XFS_LIVE_HOOKS */
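A consumer lifecycle sketch in the style of online fsck; my_hook, its notifier wiring, and the exact ordering are assumptions based on the declarations above:

	struct xfs_dir_hook	my_hook;	/* notifier wiring elided */
	int			error;

	xfs_dir_hook_enable();			/* patch in the static branch */
	error = xfs_dir_hook_add(mp, &my_hook);	/* start receiving updates */
	/* ... xfs_dir_update_hook() now calls into my_hook ... */
	xfs_dir_hook_del(mp, &my_hook);
	xfs_dir_hook_disable();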
1019 :
1020 : int
1021 28100616 : xfs_create(
1022 : struct mnt_idmap *idmap,
1023 : struct xfs_inode *dp,
1024 : struct xfs_name *name,
1025 : umode_t mode,
1026 : dev_t rdev,
1027 : bool init_xattrs,
1028 : xfs_inode_t **ipp)
1029 : {
1030 28100616 : int is_dir = S_ISDIR(mode);
1031 28100616 : struct xfs_mount *mp = dp->i_mount;
1032 28100616 : struct xfs_inode *ip = NULL;
1033 28100616 : struct xfs_trans *tp = NULL;
1034 28100616 : int error;
1035 28100616 : bool unlock_dp_on_error = false;
1036 28100616 : prid_t prid;
1037 28100616 : struct xfs_dquot *udqp = NULL;
1038 28100616 : struct xfs_dquot *gdqp = NULL;
1039 28100616 : struct xfs_dquot *pdqp = NULL;
1040 28100616 : struct xfs_trans_res *tres;
1041 28100616 : uint resblks;
1042 28100616 : xfs_ino_t ino;
1043 28100616 : struct xfs_parent_defer *parent;
1044 :
1045 28100616 : trace_xfs_create(dp, name);
1046 :
1047 56199798 : if (xfs_is_shutdown(mp))
1048 : return -EIO;
1049 :
1050 28099895 : prid = xfs_get_initial_prid(dp);
1051 :
1052 : /*
1053 : * Make sure that we have allocated dquot(s) on disk.
1054 : */
1055 28099895 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1056 : mapped_fsgid(idmap, &init_user_ns), prid,
1057 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1058 : &udqp, &gdqp, &pdqp);
1059 28105651 : if (error)
1060 : return error;
1061 :
1062 28103460 : if (is_dir) {
1063 6180839 : resblks = xfs_mkdir_space_res(mp, name->len);
1064 6180857 : tres = &M_RES(mp)->tr_mkdir;
1065 : } else {
1066 21922621 : resblks = xfs_create_space_res(mp, name->len);
1067 21920418 : tres = &M_RES(mp)->tr_create;
1068 : }
1069 :
1070 28101275 : error = xfs_parent_start(mp, &parent);
1071 28099696 : if (error)
1072 211 : goto out_release_dquots;
1073 :
1074 : /*
1075 : * Initially assume that the file does not exist and
1076 : * reserve the resources for that case. If that is not
1077 : * the case we'll drop the one we have and get a more
1078 : * appropriate transaction later.
1079 : */
1080 28099485 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1081 : &tp);
1082 28103680 : if (error == -ENOSPC) {
1083 : /* flush outstanding delalloc blocks and retry */
1084 238907 : xfs_flush_inodes(mp);
1085 238832 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
1086 : resblks, &tp);
1087 : }
1088 28103706 : if (error)
1089 230003 : goto out_parent;
1090 :
1091 27873703 : xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1092 27873809 : unlock_dp_on_error = true;
1093 :
1094 : /*
1095 : * A newly created regular or special file just has one directory
1096 : * entry pointing to it, but a directory also has the "." entry
1097 : * pointing to itself.
1098 : */
1099 27873809 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1100 27873897 : if (!error)
1101 49327682 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1102 : is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
1103 27872453 : if (error)
1104 146709 : goto out_trans_cancel;
1105 :
1106 : /*
1107 : * Now we join the directory inode to the transaction. We do not do it
1108 : * earlier because xfs_dialloc might commit the previous transaction
1109 : * (and release all the locks). An error from here on will result in
1110 : * the transaction cancel unlocking dp so don't do it explicitly in the
1111 : * error path.
1112 : */
1113 27725744 : xfs_trans_ijoin(tp, dp, 0);
1114 :
1115 55447466 : error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1116 27724875 : resblks - XFS_IALLOC_SPACE_RES(mp));
1117 27726113 : if (error) {
1118 282 : ASSERT(error != -ENOSPC);
1119 282 : goto out_trans_cancel;
1120 : }
1121 27725831 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1122 27723803 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1123 :
1124 27726151 : if (is_dir) {
1125 6126779 : error = xfs_dir_init(tp, ip, dp);
1126 6126656 : if (error)
1127 0 : goto out_trans_cancel;
1128 :
1129 6126656 : xfs_bumplink(tp, dp);
1130 : }
1131 :
1132 : /*
1133 : * If we have parent pointers, we need to add the attribute containing
1134 : * the parent information now.
1135 : */
1136 27726099 : if (parent) {
1137 27575170 : error = xfs_parent_add(tp, parent, dp, name, ip);
1138 27573973 : if (error)
1139 0 : goto out_trans_cancel;
1140 : }
1141 :
1142 : /*
1143 : * Create ip with a reference from dp, and add '.' and '..' references
1144 : * if it's a directory.
1145 : */
1146 27724902 : xfs_dir_update_hook(dp, ip, 1, name);
1147 :
1148 : /*
1149 : * If this is a synchronous mount, make sure that the
1150 : * create transaction goes to disk before returning to
1151 : * the user.
1152 : */
1153 27724669 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1154 236 : xfs_trans_set_sync(tp);
1155 :
1156 : /*
1157 : * Attach the dquot(s) to the inodes and modify them incore.
1158 : * These ids of the inode couldn't have changed since the new
1159 : * inode has been locked ever since it was created.
1160 : */
1161 27724669 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1162 :
1163 27725833 : error = xfs_trans_commit(tp);
1164 27726498 : if (error)
1165 165 : goto out_release_inode;
1166 :
1167 27726333 : xfs_qm_dqrele(udqp);
1168 27726522 : xfs_qm_dqrele(gdqp);
1169 27726571 : xfs_qm_dqrele(pdqp);
1170 :
1171 27726477 : *ipp = ip;
1172 27726477 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1173 27726264 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
1174 27725846 : xfs_parent_finish(mp, parent);
1175 : return 0;
1176 :
1177 146991 : out_trans_cancel:
1178 146991 : xfs_trans_cancel(tp);
1179 147159 : out_release_inode:
1180 : /*
1181 : * Wait until after the current transaction is aborted to finish the
1182 : * setup of the inode and release the inode. This prevents recursive
1183 : * transactions and deadlocks from xfs_inactive.
1184 : */
1185 147159 : if (ip) {
1186 447 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1187 447 : xfs_finish_inode_setup(ip);
1188 447 : xfs_irele(ip);
1189 : }
1190 146712 : out_parent:
1191 377162 : xfs_parent_finish(mp, parent);
1192 377790 : out_release_dquots:
1193 377790 : xfs_qm_dqrele(udqp);
1194 377376 : xfs_qm_dqrele(gdqp);
1195 377376 : xfs_qm_dqrele(pdqp);
1196 :
1197 377376 : if (unlock_dp_on_error)
1198 147159 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
1199 : return error;
1200 : }
1201 :
1202 : int
1203 2816903 : xfs_create_tmpfile(
1204 : struct mnt_idmap *idmap,
1205 : struct xfs_inode *dp,
1206 : umode_t mode,
1207 : bool init_xattrs,
1208 : struct xfs_inode **ipp)
1209 : {
1210 2816903 : struct xfs_mount *mp = dp->i_mount;
1211 2816903 : struct xfs_inode *ip = NULL;
1212 2816903 : struct xfs_trans *tp = NULL;
1213 2816903 : int error;
1214 2816903 : prid_t prid;
1215 2816903 : struct xfs_dquot *udqp = NULL;
1216 2816903 : struct xfs_dquot *gdqp = NULL;
1217 2816903 : struct xfs_dquot *pdqp = NULL;
1218 2816903 : struct xfs_trans_res *tres;
1219 2816903 : uint resblks;
1220 2816903 : xfs_ino_t ino;
1221 :
1222 5633806 : if (xfs_is_shutdown(mp))
1223 : return -EIO;
1224 :
1225 2816903 : prid = xfs_get_initial_prid(dp);
1226 :
1227 : /*
1228 : * Make sure that we have allocated dquot(s) on disk.
1229 : */
1230 2816903 : error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1231 : mapped_fsgid(idmap, &init_user_ns), prid,
1232 : XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1233 : &udqp, &gdqp, &pdqp);
1234 2817666 : if (error)
1235 : return error;
1236 :
1237 2817666 : resblks = XFS_IALLOC_SPACE_RES(mp);
1238 2817666 : tres = &M_RES(mp)->tr_create_tmpfile;
1239 :
1240 2817666 : error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1241 : &tp);
1242 2817666 : if (error)
1243 26039 : goto out_release_dquots;
1244 :
1245 2791627 : error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1246 2791858 : if (!error)
1247 2791590 : error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1248 : 0, 0, prid, init_xattrs, &ip);
1249 2791615 : if (error)
1250 2 : goto out_trans_cancel;
1251 :
1252 2791613 : if (xfs_has_wsync(mp))
1253 0 : xfs_trans_set_sync(tp);
1254 :
1255 : /*
1256 : * Attach the dquot(s) to the inodes and modify them incore.
1257 : * The ids of the inode couldn't have changed since the new
1258 : * inode has been locked ever since it was created.
1259 : */
1260 2791613 : xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1261 :
1262 2791611 : error = xfs_iunlink(tp, ip);
1263 2791585 : if (error)
1264 1 : goto out_trans_cancel;
1265 :
1266 2791584 : error = xfs_trans_commit(tp);
1267 2791627 : if (error)
1268 1 : goto out_release_inode;
1269 :
1270 2791626 : xfs_qm_dqrele(udqp);
1271 2791626 : xfs_qm_dqrele(gdqp);
1272 2791627 : xfs_qm_dqrele(pdqp);
1273 :
1274 2791627 : *ipp = ip;
1275 2791627 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1276 2791627 : return 0;
1277 :
1278 3 : out_trans_cancel:
1279 3 : xfs_trans_cancel(tp);
1280 4 : out_release_inode:
1281 : /*
1282 : * Wait until after the current transaction is aborted to finish the
1283 : * setup of the inode and release the inode. This prevents recursive
1284 : * transactions and deadlocks from xfs_inactive.
1285 : */
1286 4 : if (ip) {
1287 2 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1288 2 : xfs_finish_inode_setup(ip);
1289 2 : xfs_irele(ip);
1290 : }
1291 2 : out_release_dquots:
1292 26043 : xfs_qm_dqrele(udqp);
1293 26043 : xfs_qm_dqrele(gdqp);
1294 26043 : xfs_qm_dqrele(pdqp);
1295 :
1296 26043 : return error;
1297 : }
1298 :
1299 : int
1300 5799398 : xfs_link(
1301 : struct xfs_inode *tdp,
1302 : struct xfs_inode *sip,
1303 : struct xfs_name *target_name)
1304 : {
1305 5799398 : struct xfs_mount *mp = tdp->i_mount;
1306 5799398 : struct xfs_trans *tp;
1307 5799398 : int error, nospace_error = 0;
1308 5799398 : int resblks;
1309 5799398 : struct xfs_parent_defer *parent = NULL;
1310 :
1311 5799398 : trace_xfs_link(tdp, target_name);
1312 :
1313 5799302 : ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1314 :
1315 11598604 : if (xfs_is_shutdown(mp))
1316 : return -EIO;
1317 :
1318 5799301 : error = xfs_qm_dqattach(sip);
1319 5799329 : if (error)
1320 0 : goto std_return;
1321 :
1322 5799329 : error = xfs_qm_dqattach(tdp);
1323 5799296 : if (error)
1324 0 : goto std_return;
1325 :
1326 5799296 : error = xfs_parent_start(mp, &parent);
1327 5799307 : if (error)
1328 0 : goto std_return;
1329 :
1330 5799307 : resblks = xfs_link_space_res(mp, target_name->len);
1331 5799290 : error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
1332 : &tp, &nospace_error);
1333 5799445 : if (error)
1334 0 : goto out_parent;
1335 :
1336 : /*
1337 : * We don't allow reservationless or quotaless hardlinking when parent
1338 : * pointers are enabled because we can't back out if the xattrs must
1339 : * grow.
1340 : */
1341 5799445 : if (parent && nospace_error) {
1342 24073 : error = nospace_error;
1343 24073 : goto error_return;
1344 : }
1345 :
1346 : /*
1347 : * If we are using project inheritance, we only allow hard link
1348 : * creation in our tree when the project IDs are the same; else
1349 : * the tree quota mechanism could be circumvented.
1350 : */
1351 5775372 : if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1352 : tdp->i_projid != sip->i_projid)) {
1353 0 : error = -EXDEV;
1354 0 : goto error_return;
1355 : }
1356 :
1357 5775372 : if (!resblks) {
1358 0 : error = xfs_dir_canenter(tp, tdp, target_name);
1359 0 : if (error)
1360 0 : goto error_return;
1361 : }
1362 :
1363 : /*
1364 : * Handle initial link state of O_TMPFILE inode
1365 : */
1366 5775372 : if (VFS_I(sip)->i_nlink == 0) {
1367 8538 : struct xfs_perag *pag;
1368 :
1369 8538 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1370 8538 : error = xfs_iunlink_remove(tp, pag, sip);
1371 8538 : xfs_perag_put(pag);
1372 8538 : if (error)
1373 0 : goto error_return;
1374 : }
1375 :
1376 5775372 : error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1377 : resblks);
1378 5775400 : if (error)
1379 3 : goto error_return;
1380 5775397 : xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1381 5775357 : xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1382 :
1383 5775318 : xfs_bumplink(tp, sip);
1384 :
1385 : /*
1386 : * If we have parent pointers, we now need to add the parent record to
1387 : * the attribute fork of the inode. If this is the initial parent
1388 : * attribute, we need to create it correctly, otherwise we can just add
1389 : * the parent to the inode.
1390 : */
1391 5775361 : if (parent) {
1392 5743906 : error = xfs_parent_add(tp, parent, tdp, target_name, sip);
1393 5743762 : if (error)
1394 0 : goto error_return;
1395 : }
1396 :
1397 5775217 : xfs_dir_update_hook(tdp, sip, 1, target_name);
1398 :
1399 : /*
1400 : * If this is a synchronous mount, make sure that the
1401 : * link transaction goes to disk before returning to
1402 : * the user.
1403 : */
1404 5775238 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1405 0 : xfs_trans_set_sync(tp);
1406 :
1407 5775238 : error = xfs_trans_commit(tp);
1408 5775339 : xfs_iunlock(tdp, XFS_ILOCK_EXCL);
1409 5775341 : xfs_iunlock(sip, XFS_ILOCK_EXCL);
1410 5775327 : xfs_parent_finish(mp, parent);
1411 : return error;
1412 :
1413 24076 : error_return:
1414 24076 : xfs_trans_cancel(tp);
1415 24076 : xfs_iunlock(tdp, XFS_ILOCK_EXCL);
1416 24076 : xfs_iunlock(sip, XFS_ILOCK_EXCL);
1417 24076 : out_parent:
1418 24076 : xfs_parent_finish(mp, parent);
1419 24076 : std_return:
1420 24076 : if (error == -ENOSPC && nospace_error)
1421 23646 : error = nospace_error;
1422 : return error;
1423 : }
1424 :
1425 : /* Clear the reflink flag and the cowblocks tag if possible. */
1426 : static void
1427 10963269 : xfs_itruncate_clear_reflink_flags(
1428 : struct xfs_inode *ip)
1429 : {
1430 10963269 : struct xfs_ifork *dfork;
1431 10963269 : struct xfs_ifork *cfork;
1432 :
1433 10963269 : if (!xfs_is_reflink_inode(ip))
1434 : return;
1435 4313823 : dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1436 4313823 : cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
1437 4313823 : if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1438 814915 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1439 4313823 : if (cfork->if_bytes == 0)
1440 4199728 : xfs_inode_clear_cowblocks_tag(ip);
1441 : }
1442 :
1443 : /*
1444 : * Free up the underlying blocks past new_size. The new size must be smaller
1445 : * than the current size. This routine can be used both for the attribute and
1446 : * data fork, and does not modify the inode size, which is left to the caller.
1447 : *
1448 : * The transaction passed to this routine must have made a permanent log
1449 : * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1450 : * given transaction and start new ones, so make sure everything involved in
1451 : * the transaction is tidy before calling here. Some transaction will be
1452 : * returned to the caller to be committed. The incoming transaction must
1453 : * already include the inode, and both inode locks must be held exclusively.
1454 : * The inode must also be "held" within the transaction. On return the inode
1455 : * will be "held" within the returned transaction. This routine does NOT
1456 : * require any disk space to be reserved for it within the transaction.
1457 : *
1458 : * If we get an error, we must return with the inode locked and linked into the
1459 : * current transaction. This keeps things simple for the higher level code,
1460 : * because it always knows that the inode is locked and held in the transaction
1461 : * that returns to it whether errors occur or not. We don't mark the inode
1462 : * dirty on error so that transactions can be easily aborted if possible.
1463 : */
1464 : int
1465 11944169 : xfs_itruncate_extents_flags(
1466 : struct xfs_trans **tpp,
1467 : struct xfs_inode *ip,
1468 : int whichfork,
1469 : xfs_fsize_t new_size,
1470 : int flags)
1471 : {
1472 11944169 : struct xfs_mount *mp = ip->i_mount;
1473 11944169 : struct xfs_trans *tp = *tpp;
1474 11944169 : xfs_fileoff_t first_unmap_block;
1475 11944169 : int error = 0;
1476 :
1477 11944169 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1478 20179247 : ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1479 : xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1480 23888338 : ASSERT(new_size <= XFS_ISIZE(ip));
1481 11944169 : ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1482 11944169 : ASSERT(ip->i_itemp != NULL);
1483 11944169 : ASSERT(ip->i_itemp->ili_lock_flags == 0);
1484 11944169 : ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1485 :
1486 11944169 : trace_xfs_itruncate_extents_start(ip, new_size);
1487 :
1488 11944474 : flags |= xfs_bmapi_aflag(whichfork);
1489 :
1490 : /*
1491 : * Since it is possible for space to become allocated beyond
1492 : * the end of the file (in a crash where the space is allocated
1493 : * but the inode size is not yet updated), simply remove any
1494 : * blocks which show up between the new EOF and the maximum
1495 : * possible file size.
1496 : *
1497 : * We have to free all the blocks to the bmbt maximum offset, even if
1498 : * the page cache can't scale that far.
1499 : */
1500 11944474 : first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1501 11944474 : if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1502 0 : WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1503 0 : return 0;
1504 : }
1505 :
1506 11942787 : error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
1507 : XFS_MAX_FILEOFF);
1508 11944249 : if (error)
1509 1807 : goto out;
1510 :
1511 11942442 : if (whichfork == XFS_DATA_FORK) {
1512 : /* Remove all pending CoW reservations. */
1513 10961708 : error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1514 : first_unmap_block, XFS_MAX_FILEOFF, true);
1515 10961749 : if (error)
1516 0 : goto out;
1517 :
1518 10961749 : xfs_itruncate_clear_reflink_flags(ip);
1519 : }
1520 :
1521 : /*
1522 : * Always re-log the inode so that our permanent transaction can keep
1523 : * on rolling it forward in the log.
1524 : */
1525 11942526 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1526 :
1527 11945397 : trace_xfs_itruncate_extents_end(ip, new_size);
1528 :
1529 11947207 : out:
1530 11947207 : *tpp = tp;
1531 11947207 : return error;
1532 : }
1533 :
1534 : int
1535 342653896 : xfs_release(
1536 : struct xfs_inode *ip,
1537 : bool want_free_eofblocks)
1538 : {
1539 342653896 : struct xfs_mount *mp = ip->i_mount;
1540 342653896 : int error = 0;
1541 :
1542 342653896 : if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1543 : return 0;
1544 :
1545 : /* If this is a read-only mount, don't do this (would generate I/O) */
1546 685335162 : if (xfs_is_readonly(mp))
1547 : return 0;
1548 :
1549 683079714 : if (!xfs_is_shutdown(mp)) {
1550 341353440 : int truncated;
1551 :
1552 : /*
1553 : * If we previously truncated this file and removed old data
1554 : * in the process, we want to initiate "early" writeout on
1555 : * the last close. This is an attempt to combat the notorious
1556 : * NULL files problem which is particularly noticeable from a
1557 : * truncate down, buffered (re-)write (delalloc), followed by
1558 : * a crash. What we are effectively doing here is
1559 : * significantly reducing the time window where we'd otherwise
1560 : * be exposed to that problem.
1561 : */
1562 341353440 : truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1563 341365568 : if (truncated) {
1564 1587450 : xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1565 1587450 : if (ip->i_delayed_blks > 0) {
1566 119698 : error = filemap_flush(VFS_I(ip)->i_mapping);
1567 119700 : if (error)
1568 : return error;
1569 : }
1570 : }
1571 : }
1572 :
1573 341551966 : if (VFS_I(ip)->i_nlink == 0)
1574 : return 0;
1575 :
1576 : /*
1577 : * If we can't get the iolock just skip truncating the blocks past EOF
1578 : * because we could deadlock with the mmap_lock otherwise. We'll get
1579 : * another chance to drop them once the last reference to the inode is
1580 : * dropped, so we'll never leak blocks permanently.
1581 : */
1582 340553879 : if (!want_free_eofblocks || !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1583 52162943 : return 0;
1584 :
1585 288393540 : if (xfs_can_free_eofblocks(ip, false)) {
1586 : /*
1587 : * Check if the inode is being opened, written and closed
1588 : * frequently and we have delayed allocation blocks outstanding
1589 : * (e.g. streaming writes from the NFS server), truncating the
1590 : * blocks past EOF will cause fragmentation to occur.
1591 : *
1592 : * In this case don't do the truncation, but we have to be
1593 : * careful how we detect this case. Blocks beyond EOF show up as
1594 : * i_delayed_blks even when the inode is clean, so we need to
1595 : * truncate them away first before checking for a dirty release.
1596 : * Hence on the first dirty close we will still remove the
1597 : * speculative allocation, but after that we will leave it in
1598 : * place.
1599 : */
1600 40349431 : if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1601 14897946 : goto out_unlock;
1602 :
1603 5276671 : error = xfs_free_eofblocks(ip);
1604 5278717 : if (error)
1605 5 : goto out_unlock;
1606 :
1607 5278712 : xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1608 : }
1609 :
1610 268216248 : out_unlock:
1611 288392931 : xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1612 288392931 : return error;
1613 : }
1614 :
1615 : /*
1616 : * Mark all the buffers attached to this directory stale. In theory we should
1617 : * never be freeing a directory with any blocks at all, but this covers the
1618 : * case where we've recovered a directory swap with a "temporary" directory
1619 : * created by online repair and now need to dump it.
1620 : */
1621 : STATIC void
1622 0 : xfs_inactive_dir(
1623 : struct xfs_inode *dp)
1624 : {
1625 0 : struct xfs_iext_cursor icur;
1626 0 : struct xfs_bmbt_irec got;
1627 0 : struct xfs_mount *mp = dp->i_mount;
1628 0 : struct xfs_da_geometry *geo = mp->m_dir_geo;
1629 0 : struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
1630 0 : xfs_fileoff_t off;
1631 :
1632 : /*
1633 : * Invalidate each directory block. All directory blocks are of
1634 : * fsbcount length and alignment, so we only need to walk those same
1635 : * offsets. We hold the only reference to this inode, so we must wait
1636 : * for the buffer locks.
1637 : */
1638 0 : for_each_xfs_iext(ifp, &icur, &got) {
1639 0 : for (off = round_up(got.br_startoff, geo->fsbcount);
1640 0 : off < got.br_startoff + got.br_blockcount;
1641 0 : off += geo->fsbcount) {
1642 0 : struct xfs_buf *bp = NULL;
1643 0 : xfs_fsblock_t fsbno;
1644 0 : int error;
1645 :
1646 0 : fsbno = (off - got.br_startoff) + got.br_startblock;
1647 0 : error = xfs_buf_incore(mp->m_ddev_targp,
1648 0 : XFS_FSB_TO_DADDR(mp, fsbno),
1649 0 : XFS_FSB_TO_BB(mp, geo->fsbcount),
1650 : XBF_LIVESCAN, &bp);
1651 0 : if (error)
1652 0 : continue;
1653 :
1654 0 : xfs_buf_stale(bp);
1655 0 : xfs_buf_relse(bp);
1656 : }
1657 : }
1658 0 : }
1659 :
1660 : /*
1661 : * xfs_inactive_truncate
1662 : *
1663 : * Called to perform a truncate when an inode becomes unlinked.
1664 : */
1665 : STATIC int
1666 2536356 : xfs_inactive_truncate(
1667 : struct xfs_inode *ip)
1668 : {
1669 2536356 : struct xfs_mount *mp = ip->i_mount;
1670 2536356 : struct xfs_trans *tp;
1671 2536356 : int error;
1672 :
1673 2536356 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1674 2539103 : if (error) {
1675 1268 : ASSERT(xfs_is_shutdown(mp));
1676 634 : return error;
1677 : }
1678 2538469 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1679 2538312 : xfs_trans_ijoin(tp, ip, 0);
1680 :
1681 : /*
1682 : * Log the inode size first to prevent stale data exposure in the event
1683 : * of a system crash before the truncate completes. See the related
1684 : * comment in xfs_vn_setattr_size() for details.
1685 : */
1686 2536106 : ip->i_disk_size = 0;
1687 2536106 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1688 :
1689 2538444 : error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1690 2538621 : if (error)
1691 1292 : goto error_trans_cancel;
1692 :
1693 2537329 : ASSERT(ip->i_df.if_nextents == 0);
1694 :
1695 2537329 : error = xfs_trans_commit(tp);
1696 2537247 : if (error)
1697 0 : goto error_unlock;
1698 :
1699 2537247 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1700 2537247 : return 0;
1701 :
1702 : error_trans_cancel:
1703 1292 : xfs_trans_cancel(tp);
1704 1292 : error_unlock:
1705 1292 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1706 1292 : return error;
1707 : }
1708 :
1709 : /*
1710 : * xfs_inactive_ifree()
1711 : *
1712 : * Perform the inode free when an inode is unlinked.
1713 : */
1714 : STATIC int
1715 47537985 : xfs_inactive_ifree(
1716 : struct xfs_inode *ip)
1717 : {
1718 47537985 : struct xfs_mount *mp = ip->i_mount;
1719 47537985 : struct xfs_trans *tp;
1720 47537985 : int error;
1721 :
1722 : /*
1723 : * We try to use a per-AG reservation for any block needed by the finobt
1724 : * tree, but as the finobt feature predates the per-AG reservation
1725 : * support, a degraded file system might not have enough space for the
1726 : * reservation at mount time. In that case try to dip into the reserved
1727 : * pool and pray.
1728 : *
1729 : * Send a warning if the reservation does happen to fail, as the inode
1730 : * now remains allocated and sits on the unlinked list until the fs is
1731 : * repaired.
1732 : */
1733 47537985 : if (unlikely(mp->m_finobt_nores)) {
1734 0 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1735 : XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1736 : &tp);
1737 : } else {
1738 47537985 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1739 : }
1740 47550379 : if (error) {
1741 2 : if (error == -ENOSPC) {
1742 0 : xfs_warn_ratelimited(mp,
1743 : "Failed to remove inode(s) from unlinked list. "
1744 : "Please free space, unmount and run xfs_repair.");
1745 : } else {
1746 4 : ASSERT(xfs_is_shutdown(mp));
1747 : }
1748 2 : return error;
1749 : }
1750 :
1751 : /*
1752 : * We do not hold the inode locked across the entire rolling transaction
1753 : * here. We only need to hold it for the first transaction that
1754 : * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1755 : * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1756 : * here breaks the relationship between cluster buffer invalidation and
1757 : * stale inode invalidation on cluster buffer item journal commit
1758 : * completion, and can result in leaving dirty stale inodes hanging
1759 : * around in memory.
1760 : *
1761 : * We have no need for serialising this inode operation against other
1762 : * operations - we freed the inode and hence reallocation is required
1763 : * and that will serialise on reallocating the space the deferops need
1764 : * to free. Hence we can unlock the inode on the first commit of
1765 : * the transaction rather than roll it right through the deferops. This
1766 : * avoids relogging the XFS_ISTALE inode.
1767 : *
1768 : * We check that xfs_ifree() hasn't grown an internal transaction roll
1769 : * by asserting that the inode is still locked when it returns.
1770 : */
1771 47550377 : xfs_ilock(ip, XFS_ILOCK_EXCL);
1772 47549288 : xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1773 :
1774 47544418 : error = xfs_ifree(tp, ip);
1775 47548651 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1776 47548651 : if (error) {
1777 : /*
1778 : * If we fail to free the inode, shut down. The cancel
1779 : * might do that; we need to make sure. Otherwise the
1780 : * inode might be lost for a long time or forever.
1781 : */
1782 244 : if (!xfs_is_shutdown(mp)) {
1783 1 : xfs_notice(mp, "%s: xfs_ifree returned error %d",
1784 : __func__, error);
1785 1 : xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1786 : }
1787 122 : xfs_trans_cancel(tp);
1788 122 : return error;
1789 : }
1790 :
1791 : /*
1792 : * Credit the quota account(s). The inode is gone.
1793 : */
1794 47548529 : xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1795 :
1796 47547094 : return xfs_trans_commit(tp);
1797 : }
1798 :
1799 : /*
1800 : * Returns true if we need to update the on-disk metadata before we can free
1801 : * the memory used by this inode. Updates include freeing post-eof
1802 : * preallocations; freeing COW staging extents; and marking the inode free in
1803 : * the inobt if it is on the unlinked list.
1804 : */
1805 : bool
1806 1055874269 : xfs_inode_needs_inactive(
1807 : struct xfs_inode *ip)
1808 : {
1809 1055874269 : struct xfs_mount *mp = ip->i_mount;
1810 1055874269 : struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1811 :
1812 : /*
1813 : * If the inode is already free, then there can be nothing
1814 : * to clean up here.
1815 : */
1816 1055874269 : if (VFS_I(ip)->i_mode == 0)
1817 : return false;
1818 :
1819 : /* If this is a read-only mount, don't do this (would generate I/O) */
1820 2111748538 : if (xfs_is_readonly(mp))
1821 : return false;
1822 :
1823 : /* If the log isn't running, push inodes straight to reclaim. */
1824 2088251858 : if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1825 : return false;
1826 :
1827 : /* Metadata inodes require explicit resource cleanup. */
1828 751122390 : if (xfs_is_metadata_inode(ip))
1829 : return false;
1830 :
1831 : /* Want to clean out the cow blocks if there are any. */
1832 751074449 : if (cow_ifp && cow_ifp->if_bytes > 0)
1833 : return true;
1834 :
1835 : /* Unlinked files must be freed. */
1836 751068870 : if (VFS_I(ip)->i_nlink == 0)
1837 : return true;
1838 :
1839 : /*
1840 : * This file isn't being freed, so check if there are post-eof blocks
1841 : * to free. @force is true because we are evicting an inode from the
1842 : * cache. Post-eof blocks must be freed, lest we end up with broken
1843 : * free space accounting.
1844 : *
1845 : * Note: don't bother with iolock here since lockdep complains about
1846 : * acquiring it in reclaim context. We have the only reference to the
1847 : * inode at this point anyways.
1848 : */
1849 703538894 : return xfs_can_free_eofblocks(ip, true);
1850 : }
1851 :
1852 : /*
1853 : * Save health status somewhere, if we're dumping an inode with uncorrected
1854 : * errors and online repair isn't running.
1855 : */
1856 : static inline void
1857 47726163 : xfs_inactive_health(
1858 : struct xfs_inode *ip)
1859 : {
1860 47726163 : struct xfs_mount *mp = ip->i_mount;
1861 47726163 : struct xfs_perag *pag;
1862 47726163 : unsigned int sick;
1863 47726163 : unsigned int checked;
1864 :
1865 47726163 : xfs_inode_measure_sickness(ip, &sick, &checked);
1866 47723604 : if (!sick)
1867 47723604 : return;
1868 :
1869 0 : trace_xfs_inode_unfixed_corruption(ip, sick);
1870 :
1871 0 : if (sick & XFS_SICK_INO_FORGET)
1872 : return;
1873 :
1874 0 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1875 0 : if (!pag) {
1876 : /* There had better still be a perag structure! */
1877 0 : ASSERT(0);
1878 0 : return;
1879 : }
1880 :
1881 0 : xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
1882 0 : xfs_perag_put(pag);
1883 : }
1884 :
1885 : /*
1886 : * xfs_inactive
1887 : *
1888 : * This is called when the reference count for the vnode
1889 : * goes to zero. If the file has been unlinked, then it must
1890 : * now be truncated. Also, we clear all of the read-ahead state
1891 : * kept for the inode here since the file is now closed.
1892 : */
1893 : int
1894 47732809 : xfs_inactive(
1895 : xfs_inode_t *ip)
1896 : {
1897 47732809 : struct xfs_mount *mp;
1898 47732809 : int error = 0;
1899 47732809 : int truncate = 0;
1900 :
1901 : /*
1902 : * If the inode is already free, then there can be nothing
1903 : * to clean up here.
1904 : */
1905 47732809 : if (VFS_I(ip)->i_mode == 0) {
1906 0 : ASSERT(ip->i_df.if_broot_bytes == 0);
1907 0 : goto out;
1908 : }
1909 :
1910 47732809 : mp = ip->i_mount;
1911 95452376 : ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1912 :
1913 47719567 : xfs_inactive_health(ip);
1914 :
1915 : /* If this is a read-only mount, don't do this (would generate I/O) */
1916 95454500 : if (xfs_is_readonly(mp))
1917 0 : goto out;
1918 :
1919 : /* Metadata inodes require explicit resource cleanup. */
1920 47727250 : if (xfs_is_metadata_inode(ip))
1921 0 : goto out;
1922 :
1923 : /* Try to clean out the cow blocks if there are any. */
1924 95454500 : if (xfs_inode_has_cow_data(ip))
1925 5579 : xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1926 :
1927 47727250 : if (VFS_I(ip)->i_nlink != 0) {
1928 : /*
1929 : * force is true because we are evicting an inode from the
1930 : * cache. Post-eof blocks must be freed, lest we end up with
1931 : * broken free space accounting.
1932 : *
1933 : * Note: don't bother with iolock here since lockdep complains
1934 : * about acquiring it in reclaim context. We have the only
1935 : * reference to the inode at this point anyways.
1936 : */
1937 196373 : if (xfs_can_free_eofblocks(ip, true))
1938 191906 : error = xfs_free_eofblocks(ip);
1939 :
1940 196375 : goto out;
1941 : }
1942 :
1943 47530877 : if (S_ISREG(VFS_I(ip)->i_mode) &&
1944 14633339 : (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
1945 12117997 : ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1946 : truncate = 1;
1947 :
1948 47530877 : error = xfs_qm_dqattach(ip);
1949 47517989 : if (error)
1950 7 : goto out;
1951 :
1952 47517982 : if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) {
1953 0 : xfs_inactive_dir(ip);
1954 0 : truncate = 1;
1955 : }
1956 :
1957 47517982 : if (S_ISLNK(VFS_I(ip)->i_mode))
1958 30774157 : error = xfs_inactive_symlink(ip);
1959 16743825 : else if (truncate)
1960 2536183 : error = xfs_inactive_truncate(ip);
1961 47523335 : if (error)
1962 1941 : goto out;
1963 :
1964 : /*
1965 : * If there are attributes associated with the file then blow them away
1966 : * now. The code calls a routine that recursively deconstructs the
1967 : * attribute fork. It also blows away the in-core attribute fork.
1968 : */
1969 47521394 : if (xfs_inode_has_attr_fork(ip)) {
1970 39449998 : error = xfs_attr_inactive(ip);
1971 39470347 : if (error)
1972 213 : goto out;
1973 : }
1974 :
1975 47541530 : ASSERT(ip->i_forkoff == 0);
1976 :
1977 : /*
1978 : * Free the inode.
1979 : */
1980 47541530 : error = xfs_inactive_ifree(ip);
1981 :
1982 47749438 : out:
1983 : /*
1984 : * We're done making metadata updates for this inode, so we can release
1985 : * the attached dquots.
1986 : */
1987 47749438 : xfs_qm_dqdetach(ip);
1988 47747652 : return error;
1989 : }
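     :
     : /*
     :  * To summarise the inactivation path above: cancel any outstanding CoW
     :  * reservations; trim post-EOF blocks (still-linked files) or truncate
     :  * the data fork to zero (unlinked regular files); tear down the
     :  * attribute fork; free the inode via xfs_inactive_ifree(); and finally
     :  * detach the dquots.
     :  */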
1990 :
1991 : /*
1992 : * In-Core Unlinked List Lookups
1993 : * =============================
1994 : *
1995 : * Every inode is supposed to be reachable from some other piece of metadata
1996 : * with the exception of the root directory. Inodes with a connection to a
1997 : * file descriptor but not linked from anywhere in the on-disk directory tree
1998 : * are collectively known as unlinked inodes, though the filesystem itself
1999 : * maintains links to these inodes so that on-disk metadata are consistent.
2000 : *
2001 : * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
2002 : * header contains a number of buckets that point to an inode, and each inode
2003 : * record has a pointer to the next inode in the hash chain. This
2004 : * singly-linked list causes scaling problems in the iunlink remove function
2005 : * because we must walk that list to find the inode that points to the inode
2006 : * being removed from the unlinked hash bucket list.
2007 : *
2008 : * Hence we keep an in-memory doubly linked list to link each inode on an
2009 : * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
2010 : * based lists would require having 64 list heads in the perag, one for each
2011 : * list. This is expensive in terms of memory (think millions of AGs) and cache
2012 : * misses on lookups. Instead, use the fact that inodes on the unlinked list
2013 : * must be referenced at the VFS level to keep them on the list and hence we
2014 : * have an existence guarantee for inodes on the unlinked list.
2015 : *
2016 : * Given we have an existence guarantee, we can use lockless inode cache lookups
2017 : * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
2018 : * for the doubly linked unlinked list, and we don't need any extra locking to
2019 : * keep the list safe as all manipulations are done under the AGI buffer lock.
2020 : * Keeping the list up to date does not require memory allocation, just finding
2021 : * the XFS inode and updating the next/prev unlinked list aginos.
2022 : */
2023 :
2024 : /*
2025 : * Find an inode on the unlinked list. This does not take references to the
2026 : * inode as we have existence guarantees by holding the AGI buffer lock and that
2027 : * inode because holding the AGI buffer lock gives us an existence guarantee:
2028 : * only unlinked, referenced inodes can be on the unlinked inode list. If we
2029 : */
2030 : struct xfs_inode *
2031 23076663 : xfs_iunlink_lookup(
2032 : struct xfs_perag *pag,
2033 : xfs_agino_t agino)
2034 : {
2035 23076663 : struct xfs_inode *ip;
2036 :
2037 23076663 : rcu_read_lock();
2038 23072382 : ip = radix_tree_lookup(&pag->pag_ici_root, agino);
2039 :
2040 : /*
2041 : * An inode missing from memory or sitting in RCU freeing limbo
2042 : * should never be found here. Warn and let the caller handle the failure.
2043 : */
2044 46113057 : if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
2045 0 : xfs_emerg(pag->pag_mount, "IUNLINK agno 0x%x agino 0x%x ino 0x%llx ip? %d", pag->pag_agno, agino, XFS_AGINO_TO_INO(pag->pag_mount, pag->pag_agno, agino), ip != NULL);
2046 0 : rcu_read_unlock();
2047 0 : return NULL;
2048 : }
2049 46132828 : ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
2050 46157784 : if (xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
2051 0 : xfs_emerg(pag->pag_mount, "IUNLINK agno 0x%x agino 0x%x ino 0x%llx ipino 0x%llx", pag->pag_agno, agino, XFS_AGINO_TO_INO(pag->pag_mount, pag->pag_agno, agino), ip->i_ino);
2052 23079694 : rcu_read_unlock();
2053 23079694 : return ip;
2054 : }
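     :
     : /*
     :  * A minimal sketch (hypothetical helper, not part of the original source)
     :  * of how the in-memory list can be walked with the lookup above. The
     :  * caller must hold the AGI buffer lock so the existence guarantee
     :  * described earlier covers every inode in the chain.
     :  */
     : static inline void
     : xfs_iunlink_walk_sketch(
     : 	struct xfs_perag	*pag,
     : 	xfs_agino_t		head_agino)
     : {
     : 	xfs_agino_t		agino = head_agino;
     :
     : 	while (agino != NULLAGINO) {
     : 		struct xfs_inode	*ip;
     :
     : 		/* Lockless cache lookup; no inode reference is taken. */
     : 		ip = xfs_iunlink_lookup(pag, agino);
     : 		if (!ip)
     : 			break;	/* cache miss; callers treat this as corruption */
     : 		agino = ip->i_next_unlinked;
     : 	}
     : }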
2055 :
2056 : /* Update the prev pointer of the next agino. */
2057 : static int
2058 98662390 : xfs_iunlink_update_backref(
2059 : struct xfs_perag *pag,
2060 : xfs_agino_t prev_agino,
2061 : xfs_agino_t next_agino)
2062 : {
2063 98662390 : struct xfs_inode *ip;
2064 :
2065 : /* No update necessary if we are at the end of the list. */
2066 98662390 : if (next_agino == NULLAGINO)
2067 : return 0;
2068 :
2069 19516230 : ip = xfs_iunlink_lookup(pag, next_agino);
2070 19515656 : if (!ip) {
2071 0 : xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2072 0 : return -EFSCORRUPTED;
2073 : }
2074 :
2075 19515656 : ip->i_prev_unlinked = prev_agino;
2076 19515656 : return 0;
2077 : }
2078 :
2079 : /*
2080 : * Point the AGI unlinked bucket at an inode and log the results. The caller
2081 : * is responsible for validating the old value.
2082 : */
2083 : STATIC int
2084 95112848 : xfs_iunlink_update_bucket(
2085 : struct xfs_trans *tp,
2086 : struct xfs_perag *pag,
2087 : struct xfs_buf *agibp,
2088 : unsigned int bucket_index,
2089 : xfs_agino_t new_agino)
2090 : {
2091 95112848 : struct xfs_agi *agi = agibp->b_addr;
2092 95112848 : xfs_agino_t old_value;
2093 95112848 : int offset;
2094 :
2095 151338366 : ASSERT(xfs_verify_agino_or_null(pag, new_agino));
2096 :
2097 95112848 : old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2098 95112848 : trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
2099 : old_value, new_agino);
2100 :
2101 : /*
2102 : * We should never find the head of the list already set to the value
2103 : * passed in because either we're adding or removing ourselves from the
2104 : * head of the list.
2105 : */
2106 95116100 : if (old_value == new_agino) {
2107 0 : xfs_buf_mark_corrupt(agibp);
2108 0 : xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2109 0 : return -EFSCORRUPTED;
2110 : }
2111 :
2112 95116100 : agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2113 95116100 : offset = offsetof(struct xfs_agi, agi_unlinked) +
2114 : (sizeof(xfs_agino_t) * bucket_index);
2115 95116100 : xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2116 95116100 : return 0;
2117 : }
2118 :
2119 : static int
2120 49341651 : xfs_iunlink_insert_inode(
2121 : struct xfs_trans *tp,
2122 : struct xfs_perag *pag,
2123 : struct xfs_buf *agibp,
2124 : struct xfs_inode *ip)
2125 : {
2126 49341651 : struct xfs_mount *mp = tp->t_mountp;
2127 49341651 : struct xfs_agi *agi = agibp->b_addr;
2128 49341651 : xfs_agino_t next_agino;
2129 49341651 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2130 49341651 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2131 49341651 : int error;
2132 :
2133 : /*
2134 : * Get the index into the agi hash table for the list this inode will
2135 : * go on. Make sure the pointer isn't garbage and that this inode
2136 : * isn't already on the list.
2137 : */
2138 49341651 : next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2139 49341651 : if (next_agino == agino ||
2140 : !xfs_verify_agino_or_null(pag, next_agino)) {
2141 0 : xfs_buf_mark_corrupt(agibp);
2142 0 : xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2143 0 : return -EFSCORRUPTED;
2144 : }
2145 :
2146 : /*
2147 : * Update the prev pointer in the next inode to point back to this
2148 : * inode.
2149 : */
2150 49343263 : error = xfs_iunlink_update_backref(pag, agino, next_agino);
2151 49341964 : if (error)
2152 : return error;
2153 :
2154 49341964 : if (next_agino != NULLAGINO) {
2155 : /*
2156 : * There is already another inode in the bucket, so point this
2157 : * inode to the current head of the list.
2158 : */
2159 10447715 : error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
2160 10448621 : if (error)
2161 : return error;
2162 10448621 : ip->i_next_unlinked = next_agino;
2163 : }
2164 :
2165 : /* Point the head of the list to point to this inode. */
2166 49342870 : ip->i_prev_unlinked = NULLAGINO;
2167 49342870 : return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
2168 : }
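     :
     : /*
     :  * Pictorially, inserting agino A at the head of bucket B turns
     :  *
     :  *	AGI[B] -> N -> ...
     :  * into
     :  *	AGI[B] -> A -> N -> ...
     :  *
     :  * with A->i_next_unlinked = N, N->i_prev_unlinked = A and
     :  * A->i_prev_unlinked = NULLAGINO.
     :  */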
2169 :
2170 : /*
2171 : * This is called when the inode's link count has gone to 0 or we are creating
2172 : * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
2173 : *
2174 : * We place the on-disk inode on a list in the AGI. It will be pulled from this
2175 : * list when the inode is freed.
2176 : */
2177 : int
2178 49343118 : xfs_iunlink(
2179 : struct xfs_trans *tp,
2180 : struct xfs_inode *ip)
2181 : {
2182 49343118 : struct xfs_mount *mp = tp->t_mountp;
2183 49343118 : struct xfs_perag *pag;
2184 49343118 : struct xfs_buf *agibp;
2185 49343118 : int error;
2186 :
2187 49343118 : ASSERT(VFS_I(ip)->i_nlink == 0);
2188 49343118 : ASSERT(VFS_I(ip)->i_mode != 0);
2189 49343118 : trace_xfs_iunlink(ip);
2190 :
2191 49343724 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2192 :
2193 : /* Get the agi buffer first. It ensures lock ordering on the list. */
2194 49343654 : error = xfs_read_agi(pag, tp, &agibp);
2195 49342371 : if (error)
2196 8 : goto out;
2197 :
2198 49342363 : error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
2199 49341795 : out:
2200 49341795 : xfs_perag_put(pag);
2201 49342378 : return error;
2202 : }
2203 :
2204 : static int
2205 49342969 : xfs_iunlink_remove_inode(
2206 : struct xfs_trans *tp,
2207 : struct xfs_perag *pag,
2208 : struct xfs_buf *agibp,
2209 : struct xfs_inode *ip)
2210 : {
2211 49342969 : struct xfs_mount *mp = tp->t_mountp;
2212 49342969 : struct xfs_agi *agi = agibp->b_addr;
2213 49342969 : xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2214 49342969 : xfs_agino_t head_agino;
2215 49342969 : short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2216 49342969 : int error;
2217 :
2218 49342969 : trace_xfs_iunlink_remove(ip);
2219 :
2220 : /*
2221 : * Get the index into the agi hash table for the list this inode is
2222 : * on. Make sure the head pointer isn't garbage.
2223 : */
2224 49337352 : head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2225 49337352 : if (!xfs_verify_agino(pag, head_agino)) {
2226 7493 : XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2227 : agi, sizeof(*agi));
2228 0 : xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2229 0 : return -EFSCORRUPTED;
2230 : }
2231 :
2232 : /*
2233 : * Set our inode's next_unlinked pointer to NULLAGINO and then return
2234 : * the old pointer value so that we can update whatever was previous
2235 : * to us in the list to point to whatever was next in the list.
2236 : */
2237 49329859 : error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
2238 49329065 : if (error)
2239 : return error;
2240 :
2241 : /*
2242 : * Update the prev pointer in the next inode to point back to previous
2243 : * inode in the chain.
2244 : */
2245 49332178 : error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
2246 : ip->i_next_unlinked);
2247 49325783 : if (error)
2248 : return error;
2249 :
2250 49325783 : if (head_agino != agino) {
2251 3556405 : struct xfs_inode *prev_ip;
2252 :
2253 3556405 : prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
2254 3556308 : if (!prev_ip) {
2255 0 : xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
2256 0 : return -EFSCORRUPTED;
2257 : }
2258 :
2259 3556308 : error = xfs_iunlink_log_inode(tp, prev_ip, pag,
2260 : ip->i_next_unlinked);
2261 3556116 : prev_ip->i_next_unlinked = ip->i_next_unlinked;
2262 : } else {
2263 : /* Point the head of the list to the next unlinked inode. */
2264 45769378 : error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
2265 : ip->i_next_unlinked);
2266 : }
2267 :
2268 49342068 : ip->i_next_unlinked = NULLAGINO;
2269 49342068 : ip->i_prev_unlinked = 0;
2270 49342068 : return error;
2271 : }
2272 :
2273 : /*
2274 : * Pull the on-disk inode from the AGI unlinked list.
2275 : */
2276 : int
2277 49345356 : xfs_iunlink_remove(
2278 : struct xfs_trans *tp,
2279 : struct xfs_perag *pag,
2280 : struct xfs_inode *ip)
2281 : {
2282 49345356 : struct xfs_buf *agibp;
2283 49345356 : int error;
2284 :
2285 49345356 : trace_xfs_iunlink_remove(ip);
2286 :
2287 : /* Get the agi buffer first. It ensures lock ordering on the list. */
2288 49344407 : error = xfs_read_agi(pag, tp, &agibp);
2289 49337956 : if (error)
2290 : return error;
2291 :
2292 49336655 : return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
2293 : }
2294 :
2295 : /*
2296 : * Look up the specified inode number and, if it is not already marked
2297 : * XFS_ISTALE, mark it stale. This lookup should only find clean inodes
2298 : * that aren't already stale.
2299 : */
2300 : static void
2301 4388934 : xfs_ifree_mark_inode_stale(
2302 : struct xfs_perag *pag,
2303 : struct xfs_inode *free_ip,
2304 : xfs_ino_t inum)
2305 : {
2306 4388934 : struct xfs_mount *mp = pag->pag_mount;
2307 4388934 : struct xfs_inode_log_item *iip;
2308 4388934 : struct xfs_inode *ip;
2309 :
2310 4388934 : retry:
2311 4388934 : rcu_read_lock();
2312 4388906 : ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2313 :
2314 : /* Inode not in memory, nothing to do */
2315 4389037 : if (!ip) {
2316 1046831 : rcu_read_unlock();
2317 1046831 : return;
2318 : }
2319 :
2320 : /*
2321 : * Because this is an RCU-protected lookup, we could find a recently
2322 : * freed or even reallocated inode during the lookup. We need to check
2323 : * under the i_flags_lock for a valid inode here. Skip it if it is not
2324 : * valid, is the wrong inode, or is stale.
2325 : */
2326 3342206 : spin_lock(&ip->i_flags_lock);
2327 3341815 : if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2328 44495 : goto out_iflags_unlock;
2329 :
2330 : /*
2331 : * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2332 : * other inodes that we did not find in the list attached to the buffer
2333 : * and are not already marked stale. If we can't lock it, back off and
2334 : * retry.
2335 : */
2336 3297320 : if (ip != free_ip) {
2337 3219197 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2338 0 : spin_unlock(&ip->i_flags_lock);
2339 0 : rcu_read_unlock();
2340 0 : delay(1);
2341 0 : goto retry;
2342 : }
2343 : }
2344 3297390 : ip->i_flags |= XFS_ISTALE;
2345 :
2346 : /*
2347 : * If the inode is flushing, it is already attached to the buffer. All
2348 : * we needed to do here is mark the inode stale so buffer IO completion
2349 : * will remove it from the AIL.
2350 : */
2351 3297390 : iip = ip->i_itemp;
2352 3297390 : if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2353 796 : ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2354 796 : ASSERT(iip->ili_last_fields);
2355 796 : goto out_iunlock;
2356 : }
2357 :
2358 : /*
2359 : * Inodes not attached to the buffer can be released immediately.
2360 : * Everything else has to go through xfs_iflush_abort() on journal
2361 : * commit as the flock synchronises removal of the inode from the
2362 : * cluster buffer against inode reclaim.
2363 : */
2364 3296594 : if (!iip || list_empty(&iip->ili_item.li_bio_list))
2365 153937 : goto out_iunlock;
2366 :
2367 3142657 : __xfs_iflags_set(ip, XFS_IFLUSHING);
2368 3142657 : spin_unlock(&ip->i_flags_lock);
2369 3142711 : rcu_read_unlock();
2370 :
2371 : /* we have a dirty inode in memory that has not yet been flushed. */
2372 3142755 : spin_lock(&iip->ili_lock);
2373 3142945 : iip->ili_last_fields = iip->ili_fields;
2374 3142945 : iip->ili_fields = 0;
2375 3142945 : iip->ili_fsync_fields = 0;
2376 3142945 : spin_unlock(&iip->ili_lock);
2377 3142935 : ASSERT(iip->ili_last_fields);
2378 :
2379 3142935 : if (ip != free_ip)
2380 3064561 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2381 : return;
2382 :
2383 154733 : out_iunlock:
2384 154733 : if (ip != free_ip)
2385 154715 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2386 18 : out_iflags_unlock:
2387 199231 : spin_unlock(&ip->i_flags_lock);
2388 199234 : rcu_read_unlock();
2389 : }
2390 :
2391 : /*
2392 : * A big issue when freeing the inode cluster is that we _cannot_ skip any
2393 : * inodes that are in memory - they all must be marked stale and attached to
2394 : * the cluster buffer.
2395 : */
2396 : static int
2397 78393 : xfs_ifree_cluster(
2398 : struct xfs_trans *tp,
2399 : struct xfs_perag *pag,
2400 : struct xfs_inode *free_ip,
2401 : struct xfs_icluster *xic)
2402 : {
2403 78393 : struct xfs_mount *mp = free_ip->i_mount;
2404 78393 : struct xfs_ino_geometry *igeo = M_IGEO(mp);
2405 78393 : struct xfs_buf *bp;
2406 78393 : xfs_daddr_t blkno;
2407 78393 : xfs_ino_t inum = xic->first_ino;
2408 78393 : int nbufs;
2409 78393 : int i, j;
2410 78393 : int ioffset;
2411 78393 : int error;
2412 :
2413 78393 : nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2414 :
2415 235195 : for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2416 : /*
2417 : * The allocation bitmap tells us which inodes of the chunk were
2418 : * physically allocated. Skip the cluster if an inode falls into
2419 : * a sparse region.
2420 : */
2421 156793 : ioffset = inum - xic->first_ino;
2422 156793 : if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2423 19595 : ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2424 19595 : continue;
2425 : }
2426 :
2427 137198 : blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2428 : XFS_INO_TO_AGBNO(mp, inum));
2429 :
2430 : /*
2431 : * We obtain and lock the backing buffer first in the process
2432 : * here to ensure dirty inodes attached to the buffer remain in
2433 : * the flushing state while we mark them stale.
2434 : *
2435 : * If we scan the in-memory inodes first, then buffer IO can
2436 : * complete before we get a lock on it, and hence we may fail
2437 : * to mark all the active inodes on the buffer stale.
2438 : */
2439 137198 : error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2440 137198 : mp->m_bsize * igeo->blocks_per_cluster,
2441 : XBF_UNMAPPED, &bp);
2442 137203 : if (error)
2443 0 : return error;
2444 :
2445 : /*
2446 : * This buffer may not have been correctly initialised as we
2447 : * didn't read it from disk. That's not important because we are
2448 : * only using it to mark the buffer as stale in the log, and to
2449 : * attach stale cached inodes on it. That means it will never be
2450 : * dispatched for IO. If it is, we want to know about it, and we
2451 : * want it to fail. We can achieve this by adding a write
2452 : * verifier to the buffer.
2453 : */
2454 137203 : bp->b_ops = &xfs_inode_buf_ops;
2455 :
2456 : /*
2457 : * Now we need to set all the cached clean inodes as XFS_ISTALE,
2458 : * too. This requires lookups, and will skip inodes that we've
2459 : * already marked XFS_ISTALE.
2460 : */
2461 4526091 : for (i = 0; i < igeo->inodes_per_cluster; i++)
2462 4388889 : xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
2463 :
2464 137202 : xfs_trans_stale_inode_buf(tp, bp);
2465 137198 : xfs_trans_binval(tp, bp);
2466 : }
2467 : return 0;
2468 : }
2469 :
2470 : /*
2471 : * This is called to return an inode to the inode free list. The inode should
2472 : * already be truncated to 0 length and have no pages associated with it. This
2473 : * routine also assumes that the inode is already a part of the transaction.
2474 : *
2475 : * The on-disk copy of the inode will have been added to the list of unlinked
2476 : * inodes in the AGI. We need to remove the inode from that list atomically with
2477 : * respect to freeing it here.
2478 : */
2479 : int
2480 47542862 : xfs_ifree(
2481 : struct xfs_trans *tp,
2482 : struct xfs_inode *ip)
2483 : {
2484 47542862 : struct xfs_mount *mp = ip->i_mount;
2485 47542862 : struct xfs_perag *pag;
2486 47542862 : struct xfs_icluster xic = { 0 };
2487 47542862 : struct xfs_inode_log_item *iip = ip->i_itemp;
2488 47542862 : int error;
2489 :
2490 47542862 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2491 47542862 : ASSERT(VFS_I(ip)->i_nlink == 0);
2492 47542862 : ASSERT(ip->i_df.if_nextents == 0);
2493 47542862 : ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2494 47542862 : ASSERT(ip->i_nblocks == 0);
2495 :
2496 47542862 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2497 :
2498 : /*
2499 : * Free the inode first so that we guarantee that the AGI lock is going
2500 : * to be taken before we remove the inode from the unlinked list. This
2501 : * makes the AGI lock -> unlinked list modification order the same as
2502 : * used in O_TMPFILE creation.
2503 : */
2504 47532742 : error = xfs_difree(tp, pag, ip->i_ino, &xic);
2505 47545242 : if (error)
2506 122 : goto out;
2507 :
2508 47545120 : error = xfs_iunlink_remove(tp, pag, ip);
2509 47534875 : if (error)
2510 0 : goto out;
2511 :
2512 : /*
2513 : * Free any local-format data sitting around before we reset the
2514 : * data fork to extents format. Note that the attr fork data has
2515 : * already been freed by xfs_attr_inactive.
2516 : */
2517 47534875 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2518 7198341 : kmem_free(ip->i_df.if_u1.if_data);
2519 7198598 : ip->i_df.if_u1.if_data = NULL;
2520 7198598 : ip->i_df.if_bytes = 0;
2521 : }
2522 :
2523 47535132 : VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2524 47535132 : ip->i_diflags = 0;
2525 47535132 : ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
2526 47535132 : ip->i_forkoff = 0; /* mark the attr fork not in use */
2527 47535132 : ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2528 95073908 : if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
2529 0 : xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
2530 :
2531 : /* Don't attempt to replay owner changes for a deleted inode */
2532 47538776 : spin_lock(&iip->ili_lock);
2533 47547694 : iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2534 47547694 : spin_unlock(&iip->ili_lock);
2535 :
2536 : /*
2537 : * Bump the generation count so no one will be confused
2538 : * by reincarnations of this inode.
2539 : */
2540 47545493 : VFS_I(ip)->i_generation++;
2541 47545493 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2542 :
2543 47548629 : if (xic.deleted)
2544 78393 : error = xfs_ifree_cluster(tp, pag, ip, &xic);
2545 47470236 : out:
2546 47548760 : xfs_perag_put(pag);
2547 47548739 : return error;
2548 : }
2549 :
2550 : /*
2551 : * This is called to unpin an inode. The caller must have the inode locked
2552 : * in at least shared mode so that the buffer cannot be subsequently pinned
2553 : * once someone is waiting for it to be unpinned.
2554 : */
2555 : static void
2556 34 : xfs_iunpin(
2557 : struct xfs_inode *ip)
2558 : {
2559 34 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2560 :
2561 34 : trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2562 :
2563 : /* Give the log a push to start the unpinning I/O */
2564 34 : xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2565 :
2566 34 : }
2567 :
2568 : static void
2569 34 : __xfs_iunpin_wait(
2570 : struct xfs_inode *ip)
2571 : {
2572 34 : wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2573 34 : DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2574 :
2575 34 : xfs_iunpin(ip);
2576 :
2577 34 : do {
2578 34 : prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2579 34 : if (xfs_ipincount(ip))
2580 34 : io_schedule();
2581 34 : } while (xfs_ipincount(ip));
2582 34 : finish_wait(wq, &wait.wq_entry);
2583 34 : }
2584 :
2585 : void
2586 293787254 : xfs_iunpin_wait(
2587 : struct xfs_inode *ip)
2588 : {
2589 293787254 : if (xfs_ipincount(ip))
2590 34 : __xfs_iunpin_wait(ip);
2591 293787254 : }
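     :
     : /*
     :  * Note that __xfs_iunpin_wait() above is the standard waitqueue bit
     :  * pattern on __XFS_IPINNED_BIT: push the log to start the unpinning
     :  * I/O, then sleep uninterruptibly until the pin count drains to zero.
     :  */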
2592 :
2593 : /*
2594 : * Removing an inode from the namespace involves removing the directory entry
2595 : * and dropping the link count on the inode. Removing the directory entry can
2596 : * result in locking an AGF (directory blocks were freed) and removing a link
2597 : * count can result in placing the inode on an unlinked list which results in
2598 : * locking an AGI.
2599 : *
2600 : * The big problem here is that we have an ordering constraint on AGF and AGI
2601 : * locking - inode allocation locks the AGI, then can allocate a new extent for
2602 : * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2603 : * removes the inode from the unlinked list, requiring that we lock the AGI
2604 : * first, and then freeing the inode can result in an inode chunk being freed
2605 : * and hence freeing disk space requiring that we lock an AGF.
2606 : *
2607 : * Hence the ordering that is imposed by other parts of the code is AGI before
2608 : * AGF. This means we cannot remove the directory entry before we drop the inode
2609 : * reference count and put it on the unlinked list, as this results in a lock
2610 : * order of AGF then AGI, and this can deadlock against inode allocation and
2611 : * freeing. Therefore we must drop the link counts before we remove the
2612 : * directory entry.
2613 : *
2614 : * This is still safe from a transactional point of view - it is not until we
2615 : * get to xfs_defer_finish() that we have the possibility of multiple
2616 : * transactions in this operation. Hence as long as we remove the directory
2617 : * entry and drop the link count in the first transaction of the remove
2618 : * operation, there are no transactional constraints on the ordering here.
2619 : */
2620 : int
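     : /*
     :  * Roughly, then, the function below does the following within a single
     :  * permanent transaction:
     :  *
     :  *	xfs_droplink(tp, ip);			(may lock the AGI)
     :  *	xfs_dir_removename(tp, dp, name, ...);	(may lock an AGF)
     :  *
     :  * which preserves the AGI-before-AGF ordering described above.
     :  */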
2621 42711548 : xfs_remove(
2622 : struct xfs_inode *dp,
2623 : struct xfs_name *name,
2624 : struct xfs_inode *ip)
2625 : {
2626 42711548 : struct xfs_mount *mp = dp->i_mount;
2627 42711548 : struct xfs_trans *tp = NULL;
2628 42711548 : int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2629 42711548 : int dontcare;
2630 42711548 : int error = 0;
2631 42711548 : uint resblks;
2632 42711548 : struct xfs_parent_defer *parent = NULL;
2633 :
2634 42711548 : trace_xfs_remove(dp, name);
2635 :
2636 85423666 : if (xfs_is_shutdown(mp))
2637 : return -EIO;
2638 :
2639 42711379 : error = xfs_qm_dqattach(dp);
2640 42711254 : if (error)
2641 1 : goto std_return;
2642 :
2643 42711253 : error = xfs_qm_dqattach(ip);
2644 42708095 : if (error)
2645 0 : goto std_return;
2646 :
2647 42708095 : error = xfs_parent_start(mp, &parent);
2648 42710927 : if (error)
2649 0 : goto std_return;
2650 :
2651 : /*
2652 : * We try to get the real space reservation first, allowing for
2653 : * directory btree deletion(s) implying possible bmap insert(s). If we
2654 : * can't get the space reservation then we use 0 instead, and avoid the
2655 : * bmap btree insert(s) in the directory code by, if the bmap insert
2656 : * tries to happen, instead trimming the LAST block from the directory.
2657 : *
2658 : * Ignore EDQUOT and ENOSPC being returned via nospace_error because
2659 : * the directory code can handle a reservationless update and we don't
2660 : * want to prevent a user from trying to free space by deleting things.
2661 : */
2662 42710927 : resblks = xfs_remove_space_res(mp, name->len);
2663 42710751 : error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
2664 : &tp, &dontcare);
2665 42711311 : if (error) {
2666 1 : ASSERT(error != -ENOSPC);
2667 1 : goto out_parent;
2668 : }
2669 :
2670 : /*
2671 : * If we're removing a directory perform some additional validation.
2672 : */
2673 42711310 : if (is_dir) {
2674 2181540 : ASSERT(VFS_I(ip)->i_nlink >= 2);
2675 2181540 : if (VFS_I(ip)->i_nlink != 2) {
2676 1120017 : error = -ENOTEMPTY;
2677 1120017 : goto out_trans_cancel;
2678 : }
2679 1061523 : if (!xfs_dir_isempty(ip)) {
2680 641982 : error = -ENOTEMPTY;
2681 641982 : goto out_trans_cancel;
2682 : }
2683 :
2684 : /* Drop the link from ip's "..". */
2685 419543 : error = xfs_droplink(tp, dp);
2686 419549 : if (error)
2687 0 : goto out_trans_cancel;
2688 :
2689 : /* Drop the "." link from ip to self. */
2690 419549 : error = xfs_droplink(tp, ip);
2691 419552 : if (error)
2692 0 : goto out_trans_cancel;
2693 :
2694 : /*
2695 : * Point the unlinked child directory's ".." entry to the root
2696 : * directory to eliminate back-references to inodes that may
2697 : * get freed before the child directory is closed. If the fs
2698 : * gets shrunk, this can lead to dirent inode validation errors.
2699 : */
2700 419552 : if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2701 391702 : error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2702 : tp->t_mountp->m_sb.sb_rootino, 0);
2703 391700 : if (error)
2704 0 : goto out_trans_cancel;
2705 : }
2706 : } else {
2707 : /*
2708 : * When removing a non-directory we need to log the parent
2709 : * inode here. For a directory this is done implicitly
2710 : * by the xfs_droplink call for the ".." entry.
2711 : */
2712 40529770 : xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2713 : }
2714 40950116 : xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2715 :
2716 : /* Drop the link from dp to ip. */
2717 40950044 : error = xfs_droplink(tp, ip);
2718 40948228 : if (error)
2719 3 : goto out_trans_cancel;
2720 :
2721 40948225 : error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2722 40946950 : if (error) {
2723 3 : ASSERT(error != -ENOENT);
2724 3 : goto out_trans_cancel;
2725 : }
2726 :
2727 40946947 : if (parent) {
2728 40826166 : error = xfs_parent_remove(tp, parent, dp, name, ip);
2729 40827928 : if (error)
2730 0 : goto out_trans_cancel;
2731 : }
2732 :
2733 : /*
2734 : * Drop the link from dp to ip, and if ip was a directory, remove the
2735 : * '.' and '..' references since we freed the directory.
2736 : */
2737 40948709 : xfs_dir_update_hook(dp, ip, -1, name);
2738 :
2739 : /*
2740 : * If this is a synchronous mount, make sure that the
2741 : * remove transaction goes to disk before returning to
2742 : * the user.
2743 : */
2744 40947824 : if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2745 218 : xfs_trans_set_sync(tp);
2746 :
2747 40947824 : error = xfs_trans_commit(tp);
2748 40949250 : if (error)
2749 5 : goto out_unlock;
2750 :
2751 40949245 : if (is_dir && xfs_inode_is_filestream(ip))
2752 802 : xfs_filestream_deassociate(ip);
2753 :
2754 40949245 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2755 40950460 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
2756 40950509 : xfs_parent_finish(mp, parent);
2757 : return 0;
2758 :
2759 1762005 : out_trans_cancel:
2760 1762005 : xfs_trans_cancel(tp);
2761 1762013 : out_unlock:
2762 1762013 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2763 1762010 : xfs_iunlock(dp, XFS_ILOCK_EXCL);
2764 1762010 : out_parent:
2765 1762010 : xfs_parent_finish(mp, parent);
2766 : std_return:
2767 : return error;
2768 : }
2769 :
2770 : static inline void
2771 29481467 : xfs_iunlock_rename(
2772 : struct xfs_inode **i_tab,
2773 : int num_inodes)
2774 : {
2775 29481467 : int i;
2776 :
2777 128057505 : for (i = num_inodes - 1; i >= 0; i--) {
2778 : /* Skip duplicate inodes if src and target dps are the same */
2779 98576040 : if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1]))
2780 1166629 : continue;
2781 97409411 : xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL);
2782 : }
2783 29481465 : }
2784 :
2785 : /*
2786 : * Enter all inodes for a rename transaction into a sorted array.
2787 : */
2788 : #define __XFS_SORT_INODES 5
2789 : STATIC void
2790 29658590 : xfs_sort_for_rename(
2791 : struct xfs_inode *dp1, /* in: old (source) directory inode */
2792 : struct xfs_inode *dp2, /* in: new (target) directory inode */
2793 : struct xfs_inode *ip1, /* in: inode of old entry */
2794 : struct xfs_inode *ip2, /* in: inode of new entry */
2795 : struct xfs_inode *wip, /* in: whiteout inode */
2796 : struct xfs_inode **i_tab,/* out: sorted array of inodes */
2797 : int *num_inodes) /* in/out: inodes in array */
2798 : {
2799 29658590 : int i;
2800 :
2801 29658590 : ASSERT(*num_inodes == __XFS_SORT_INODES);
2802 29658590 : memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2803 :
2804 : /*
2805 : * i_tab contains a list of pointers to inodes. We initialize
2806 : * the table here and sort it. We will then use it to
2807 : * order the acquisition of the inode locks.
2808 : *
2809 : * Note that the table may contain duplicates. e.g., dp1 == dp2.
2810 : */
2811 29658590 : i = 0;
2812 29658590 : i_tab[i++] = dp1;
2813 29658590 : i_tab[i++] = dp2;
2814 29658590 : i_tab[i++] = ip1;
2815 29658590 : if (ip2)
2816 8393125 : i_tab[i++] = ip2;
2817 29658590 : if (wip)
2818 1814554 : i_tab[i++] = wip;
2819 29658590 : *num_inodes = i;
2820 :
2821 29658590 : xfs_sort_inodes(i_tab, *num_inodes);
2822 29658584 : }
2823 :
2824 : void
2825 29658583 : xfs_sort_inodes(
2826 : struct xfs_inode **i_tab,
2827 : unsigned int num_inodes)
2828 : {
2829 29658583 : int i, j;
2830 :
2831 29658583 : ASSERT(num_inodes <= __XFS_SORT_INODES);
2832 :
2833 : /*
2834 : * Sort the elements via bubble sort. (Remember, there are at
2835 : * most 5 elements to sort, so this is adequate.)
2836 : */
2837 128841839 : for (i = 0; i < num_inodes; i++) {
2838 338423287 : for (j = 1; j < num_inodes; j++) {
2839 239240031 : if (i_tab[j]->i_ino < i_tab[j-1]->i_ino)
2840 239240031 : swap(i_tab[j], i_tab[j - 1]);
2841 : }
2842 : }
2843 29658583 : }
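     :
     : /*
     :  * A hedged usage sketch (hypothetical helper, not part of the original
     :  * source): sort first, then take the ILOCKs in ascending inode number
     :  * order to avoid ABBA deadlocks, as xfs_rename() does below. Assumes
     :  * the two inodes are distinct.
     :  */
     : static inline void
     : xfs_lock_two_sorted_sketch(
     : 	struct xfs_inode	*a,
     : 	struct xfs_inode	*b)
     : {
     : 	struct xfs_inode	*tab[2] = { a, b };
     :
     : 	xfs_sort_inodes(tab, 2);
     : 	xfs_lock_inodes(tab, 2, XFS_ILOCK_EXCL);
     : }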
2844 :
2845 : static int
2846 29479021 : xfs_finish_rename(
2847 : struct xfs_trans *tp)
2848 : {
2849 : /*
2850 : * If this is a synchronous mount, make sure that the rename transaction
2851 : * goes to disk before returning to the user.
2852 : */
2853 29479021 : if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2854 0 : xfs_trans_set_sync(tp);
2855 :
2856 29479021 : return xfs_trans_commit(tp);
2857 : }
2858 :
2859 : /*
2860 : * xfs_cross_rename()
2861 : *
2862 : * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
2863 : */
2864 : STATIC int
2865 8088275 : xfs_cross_rename(
2866 : struct xfs_trans *tp,
2867 : struct xfs_inode *dp1,
2868 : struct xfs_name *name1,
2869 : struct xfs_inode *ip1,
2870 : struct xfs_parent_defer *ip1_pptr,
2871 : struct xfs_inode *dp2,
2872 : struct xfs_name *name2,
2873 : struct xfs_inode *ip2,
2874 : struct xfs_parent_defer *ip2_pptr,
2875 : int spaceres)
2876 : {
2877 8088275 : struct xfs_mount *mp = dp1->i_mount;
2878 8088275 : int error = 0;
2879 8088275 : int ip1_flags = 0;
2880 8088275 : int ip2_flags = 0;
2881 8088275 : int dp2_flags = 0;
2882 :
2883 : /* Swap inode number for dirent in first parent */
2884 8088275 : error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2885 8088276 : if (error)
2886 108 : goto out_trans_abort;
2887 :
2888 : /* Swap inode number for dirent in second parent */
2889 8088168 : error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2890 8088168 : if (error)
2891 7 : goto out_trans_abort;
2892 :
2893 : /*
2894 : * If we're renaming one or more directories across different parents,
2895 : * update the respective ".." entries (and link counts) to match the new
2896 : * parents.
2897 : */
2898 8088161 : if (dp1 != dp2) {
2899 7857214 : dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2900 :
2901 7857214 : if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2902 2463519 : error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2903 : dp1->i_ino, spaceres);
2904 2463519 : if (error)
2905 0 : goto out_trans_abort;
2906 :
2907 : /* transfer ip2 ".." reference to dp1 */
2908 2463519 : if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2909 8 : error = xfs_droplink(tp, dp2);
2910 8 : if (error)
2911 0 : goto out_trans_abort;
2912 8 : xfs_bumplink(tp, dp1);
2913 : }
2914 :
2915 : /*
2916 : * Although ip1 isn't changed here, userspace needs
2917 : * to be warned about the change, so that applications
2918 : * relying on it (like backup tools) will properly
2919 : * notice the change.
2920 : */
2921 : ip1_flags |= XFS_ICHGTIME_CHG;
2922 : ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2923 : }
2924 :
2925 7857214 : if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2926 2463519 : error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2927 : dp2->i_ino, spaceres);
2928 2463519 : if (error)
2929 0 : goto out_trans_abort;
2930 :
2931 : /* transfer ip1 ".." reference to dp2 */
2932 2463519 : if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2933 8 : error = xfs_droplink(tp, dp1);
2934 8 : if (error)
2935 0 : goto out_trans_abort;
2936 8 : xfs_bumplink(tp, dp2);
2937 : }
2938 :
2939 : /*
2940 : * Although ip2 isn't changed here, userspace needs
2941 : * to be warned about the change, so that applications
2942 : * relying on it (like backup tools) will properly
2943 : * notice the change.
2944 : */
2945 2463519 : ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2946 2463519 : ip2_flags |= XFS_ICHGTIME_CHG;
2947 : }
2948 : }
2949 :
2950 8088161 : if (xfs_has_parent(mp)) {
2951 8086951 : error = xfs_parent_replace(tp, ip1_pptr, dp1, name1, dp2,
2952 : name2, ip1);
2953 8086951 : if (error)
2954 0 : goto out_trans_abort;
2955 :
2956 8086951 : error = xfs_parent_replace(tp, ip2_pptr, dp2, name2, dp1,
2957 : name1, ip2);
2958 8086951 : if (error)
2959 0 : goto out_trans_abort;
2960 : }
2961 :
2962 8088161 : if (ip1_flags) {
2963 2463527 : xfs_trans_ichgtime(tp, ip1, ip1_flags);
2964 2463527 : xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2965 : }
2966 8088161 : if (ip2_flags) {
2967 2463527 : xfs_trans_ichgtime(tp, ip2, ip2_flags);
2968 2463527 : xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2969 : }
2970 8088161 : if (dp2_flags) {
2971 7857214 : xfs_trans_ichgtime(tp, dp2, dp2_flags);
2972 7857214 : xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2973 : }
2974 8088161 : xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2975 8088161 : xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2976 :
2977 : /*
2978 : * Inform our hook clients that we've finished an exchange operation as
2979 : * follows: removed the source and target files from their directories;
2980 : * added the target to the source directory; and added the source to
2981 : * the target directory. All inodes are locked, so it's ok to model a
2982 : * rename this way so long as we say we deleted entries before we add
2983 : * new ones.
2984 : */
2985 8088160 : xfs_dir_update_hook(dp1, ip1, -1, name1);
2986 8088160 : xfs_dir_update_hook(dp2, ip2, -1, name2);
2987 8088160 : xfs_dir_update_hook(dp1, ip2, 1, name1);
2988 8088160 : xfs_dir_update_hook(dp2, ip1, 1, name2);
2989 :
2990 8088160 : return xfs_finish_rename(tp);
2991 :
2992 115 : out_trans_abort:
2993 115 : xfs_trans_cancel(tp);
2994 115 : return error;
2995 : }
2996 :
2997 : /*
2998 : * xfs_rename_alloc_whiteout()
2999 : *
3000 : * Return a referenced, unlinked, unlocked inode that can be used as a
3001 : * whiteout in a rename transaction. We use a tmpfile inode here so that if we
3002 : * crash between allocating the inode and linking it into the rename
3003 : * transaction, recovery will free the inode and we won't leak it.
3004 : */
3005 : static int
3006 1840591 : xfs_rename_alloc_whiteout(
3007 : struct mnt_idmap *idmap,
3008 : struct xfs_name *src_name,
3009 : struct xfs_inode *dp,
3010 : struct xfs_inode **wip)
3011 : {
3012 1840591 : struct xfs_inode *tmpfile;
3013 1840591 : struct qstr name;
3014 1840591 : int error;
3015 :
3016 1840591 : error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
3017 : xfs_has_parent(dp->i_mount), &tmpfile);
3018 1840602 : if (error)
3019 : return error;
3020 :
3021 1814559 : name.name = src_name->name;
3022 1814559 : name.len = src_name->len;
3023 1814559 : error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
3024 1814559 : if (error) {
3025 0 : xfs_finish_inode_setup(tmpfile);
3026 0 : xfs_irele(tmpfile);
3027 0 : return error;
3028 : }
3029 :
3030 : /*
3031 : * Prepare the tmpfile inode as if it were created through the VFS.
3032 : * Complete the inode setup and flag it as linkable. nlink is already
3033 : * zero, so we can skip the drop_nlink.
3034 : */
3035 1814559 : xfs_setup_iops(tmpfile);
3036 1814559 : xfs_finish_inode_setup(tmpfile);
3037 1814554 : VFS_I(tmpfile)->i_state |= I_LINKABLE;
3038 :
3039 1814554 : *wip = tmpfile;
3040 1814554 : return 0;
3041 : }
3042 :
3043 : /*
3044 : * xfs_rename
3045 : */
3046 : int
3047 29684635 : xfs_rename(
3048 : struct mnt_idmap *idmap,
3049 : struct xfs_inode *src_dp,
3050 : struct xfs_name *src_name,
3051 : struct xfs_inode *src_ip,
3052 : struct xfs_inode *target_dp,
3053 : struct xfs_name *target_name,
3054 : struct xfs_inode *target_ip,
3055 : unsigned int flags)
3056 : {
3057 29684635 : struct xfs_mount *mp = src_dp->i_mount;
3058 29684635 : struct xfs_trans *tp;
3059 29684635 : struct xfs_inode *wip = NULL; /* whiteout inode */
3060 29684635 : struct xfs_inode *inodes[__XFS_SORT_INODES];
3061 29684635 : int i;
3062 29684635 : int num_inodes = __XFS_SORT_INODES;
3063 29684635 : bool new_parent = (src_dp != target_dp);
3064 29684635 : bool src_is_directory =
3065 29684635 : S_ISDIR(VFS_I(src_ip)->i_mode);
3066 29684635 : int spaceres;
3067 29684635 : bool retried = false;
3068 29684635 : int error, nospace_error = 0;
3069 29684635 : struct xfs_parent_defer *src_ip_pptr = NULL;
3070 29684635 : struct xfs_parent_defer *tgt_ip_pptr = NULL;
3071 29684635 : struct xfs_parent_defer *wip_pptr = NULL;
3072 :
3073 29684635 : trace_xfs_rename(src_dp, target_dp, src_name, target_name);
3074 :
3075 29684617 : if ((flags & RENAME_EXCHANGE) && !target_ip)
3076 : return -EINVAL;
3077 :
3078 : /*
3079 : * If we are doing a whiteout operation, allocate the whiteout inode
3080 : * we will be placing at the target and ensure the type is set
3081 : * appropriately.
3082 : */
3083 29684617 : if (flags & RENAME_WHITEOUT) {
3084 1840587 : error = xfs_rename_alloc_whiteout(idmap, src_name,
3085 : target_dp, &wip);
3086 1840601 : if (error)
3087 : return error;
3088 :
3089 : /* setup target dirent info as whiteout */
3090 1814558 : src_name->type = XFS_DIR3_FT_CHRDEV;
3091 : }
3092 :
3093 29658588 : xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
3094 : inodes, &num_inodes);
3095 :
3096 29658591 : error = xfs_parent_start(mp, &src_ip_pptr);
3097 29658593 : if (error)
3098 0 : goto out_release_wip;
3099 :
3100 29658593 : if (wip) {
3101 1814551 : error = xfs_parent_start_locked(mp, &wip_pptr);
3102 1814542 : if (error)
3103 0 : goto out_src_ip_pptr;
3104 : }
3105 :
3106 29658584 : if (target_ip) {
3107 8393113 : error = xfs_parent_start_locked(mp, &tgt_ip_pptr);
3108 8393107 : if (error)
3109 0 : goto out_wip_pptr;
3110 : }
3111 :
3112 29658578 : retry:
3113 29659809 : nospace_error = 0;
3114 59319614 : spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
3115 29659809 : target_name->len, wip != NULL);
3116 29659805 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
3117 29659839 : if (error == -ENOSPC) {
3118 177270 : nospace_error = error;
3119 177270 : spaceres = 0;
3120 177270 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
3121 : &tp);
3122 : }
3123 29659838 : if (error)
3124 298 : goto out_tgt_ip_pptr;
3125 :
3126 : /*
3127 : * We don't allow reservationless renaming when parent pointers are
3128 : * enabled because we can't back out if the xattrs must grow.
3129 : */
3130 29659540 : if (src_ip_pptr && nospace_error) {
3131 177269 : error = nospace_error;
3132 177269 : xfs_trans_cancel(tp);
3133 177269 : goto out_tgt_ip_pptr;
3134 : }
3135 :
3136 : /*
3137 : * Attach the dquots to the inodes
3138 : */
3139 29482271 : error = xfs_qm_vop_rename_dqattach(inodes);
3140 29482262 : if (error) {
3141 803 : xfs_trans_cancel(tp);
3142 803 : goto out_tgt_ip_pptr;
3143 : }
3144 :
3145 : /*
3146 : * Lock all the participating inodes. Depending upon whether
3147 : * the target_name exists in the target directory, and
3148 : * whether the target directory is the same as the source
3149 : * directory, we can lock from 2 to 5 inodes.
3150 : */
3151 29481459 : xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
3152 :
3153 : /*
3154 : * Join all the inodes to the transaction.
3155 : */
3156 29481448 : xfs_trans_ijoin(tp, src_dp, 0);
3157 29481455 : if (new_parent)
3158 28314840 : xfs_trans_ijoin(tp, target_dp, 0);
3159 29481455 : xfs_trans_ijoin(tp, src_ip, 0);
3160 29481452 : if (target_ip)
3161 8344293 : xfs_trans_ijoin(tp, target_ip, 0);
3162 29481447 : if (wip)
3163 1787345 : xfs_trans_ijoin(tp, wip, 0);
3164 :
3165 : /*
3166 : * If we are using project inheritance, we only allow renames
3167 : * into our tree when the project IDs are the same; else the
3168 : * tree quota mechanism would be circumvented.
3169 : */
3170 29481450 : if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
3171 : target_dp->i_projid != src_ip->i_projid)) {
3172 0 : error = -EXDEV;
3173 0 : goto out_trans_cancel;
3174 : }
3175 :
3176 : /* RENAME_EXCHANGE is unique from here on. */
3177 29481450 : if (flags & RENAME_EXCHANGE) {
3178 8088275 : error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
3179 : src_ip_pptr, target_dp, target_name, target_ip,
3180 : tgt_ip_pptr, spaceres);
3181 8088276 : nospace_error = 0;
3182 8088276 : goto out_unlock;
3183 : }
3184 :
3185 : /*
3186 : * Try to reserve quota to handle an expansion of the target directory.
3187 : * We'll allow the rename to continue in reservationless mode if we hit
3188 : * a space usage constraint. If we trigger reservationless mode, save
3189 : * the errno if there isn't any free space in the target directory.
3190 : */
3191 21393175 : if (spaceres != 0) {
3192 21393175 : error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
3193 : 0, false);
3194 21393190 : if (error == -EDQUOT || error == -ENOSPC) {
3195 2274 : if (!retried) {
3196 1231 : xfs_trans_cancel(tp);
3197 1231 : xfs_iunlock_rename(inodes, num_inodes);
3198 1231 : xfs_blockgc_free_quota(target_dp, 0);
3199 1231 : retried = true;
3200 1231 : goto retry;
3201 : }
3202 :
3203 : nospace_error = error;
3204 : spaceres = 0;
3205 : error = 0;
3206 : }
3207 21391959 : if (error)
3208 0 : goto out_trans_cancel;
3209 : }
3210 :
3211 : /*
3212 : * We don't allow quotaless renaming when parent pointers are enabled
3213 : * because we can't back out if the xattrs must grow.
3214 : */
3215 21391959 : if (src_ip_pptr && nospace_error) {
3216 1042 : error = nospace_error;
3217 1042 : goto out_trans_cancel;
3218 : }
3219 :
3220 : /*
3221 : * Check for expected errors before we dirty the transaction
3222 : * so we can return an error without a transaction abort.
3223 : */
3224 21390917 : if (target_ip == NULL) {
3225 : /*
3226 :		 * If there's no space reservation, check that the entry will
3227 :		 * fit before actually inserting it.
3228 : */
3229 21134896 : if (!spaceres) {
3230 0 : error = xfs_dir_canenter(tp, target_dp, target_name);
3231 0 : if (error)
3232 0 : goto out_trans_cancel;
3233 : }
3234 : } else {
3235 : /*
3236 :		 * If target exists and it's a directory, check whether
3237 :		 * it can be destroyed.
3238 : */
3239 256423 : if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3240 402 : (!xfs_dir_isempty(target_ip) ||
3241 384 : (VFS_I(target_ip)->i_nlink > 2))) {
3242 18 : error = -EEXIST;
3243 18 : goto out_trans_cancel;
3244 : }
3245 : }
3246 :
3247 : /*
3248 : * Lock the AGI buffers we need to handle bumping the nlink of the
3249 : * whiteout inode off the unlinked list and to handle dropping the
3250 : * nlink of the target inode. Per locking order rules, do this in
3251 : * increasing AG order and before directory block allocation tries to
3252 : * grab AGFs because we grab AGIs before AGFs.
3253 : *
3254 : * The (vfs) caller must ensure that if src is a directory then
3255 : * target_ip is either null or an empty directory.
3256 : */
3257 87606917 : for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
3258 66216028 : if (inodes[i] == wip ||
3259 256002 : (inodes[i] == target_ip &&
3260 256002 : (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
3261 2018585 : struct xfs_perag *pag;
3262 2018585 : struct xfs_buf *bp;
3263 :
3264 2018585 : pag = xfs_perag_get(mp,
3265 2018585 : XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
3266 2018582 : error = xfs_read_agi(pag, tp, &bp);
3267 2018586 : xfs_perag_put(pag);
3268 2018584 : if (error)
3269 9 : goto out_trans_cancel;
3270 : }
3271 : }
3272 :
3273 : /*
3274 : * Directory entry creation below may acquire the AGF. Remove
3275 : * the whiteout from the unlinked list first to preserve correct
3276 : * AGI/AGF locking order. This dirties the transaction so failures
3277 : * after this point will abort and log recovery will clean up the
3278 : * mess.
3279 : *
3280 : * For whiteouts, we need to bump the link count on the whiteout
3281 :	 * inode. After this point, we have a real link, so clear the tmpfile
3282 :	 * state flag from the inode so that it doesn't accidentally get misused
3283 :	 * in the future.
3284 : */
3285 21390889 : if (wip) {
3286 1787348 : struct xfs_perag *pag;
3287 :
3288 1787348 : ASSERT(VFS_I(wip)->i_nlink == 0);
3289 :
3290 1787348 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
3291 1787348 : error = xfs_iunlink_remove(tp, pag, wip);
3292 1787347 : xfs_perag_put(pag);
3293 1787345 : if (error)
3294 0 : goto out_trans_cancel;
3295 :
3296 1787345 : xfs_bumplink(tp, wip);
3297 1787344 : VFS_I(wip)->i_state &= ~I_LINKABLE;
3298 : }
3299 :
3300 : /*
3301 : * Set up the target.
3302 : */
3303 21390885 : if (target_ip == NULL) {
3304 : /*
3305 : * If target does not exist and the rename crosses
3306 : * directories, adjust the target directory link count
3307 : * to account for the ".." reference from the new entry.
3308 : */
3309 21134893 : error = xfs_dir_createname(tp, target_dp, target_name,
3310 : src_ip->i_ino, spaceres);
3311 21134893 : if (error)
3312 11 : goto out_trans_cancel;
3313 :
3314 21134882 : xfs_trans_ichgtime(tp, target_dp,
3315 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3316 :
3317 21134882 : if (new_parent && src_is_directory) {
3318 7863259 : xfs_bumplink(tp, target_dp);
3319 : }
3320 : } else { /* target_ip != NULL */
3321 : /*
3322 : * Link the source inode under the target name.
3323 : * If the source inode is a directory and we are moving
3324 : * it across directories, its ".." entry will be
3325 : * inconsistent until we replace that down below.
3326 : *
3327 : * In case there is already an entry with the same
3328 : * name at the destination directory, remove it first.
3329 : */
3330 255992 : error = xfs_dir_replace(tp, target_dp, target_name,
3331 : src_ip->i_ino, spaceres);
3332 255993 : if (error)
3333 2 : goto out_trans_cancel;
3334 :
3335 255991 : xfs_trans_ichgtime(tp, target_dp,
3336 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3337 :
3338 : /*
3339 : * Decrement the link count on the target since the target
3340 : * dir no longer points to it.
3341 : */
3342 255987 : error = xfs_droplink(tp, target_ip);
3343 255993 : if (error)
3344 4 : goto out_trans_cancel;
3345 :
3346 255989 : if (src_is_directory) {
3347 : /*
3348 : * Drop the link from the old "." entry.
3349 : */
3350 384 : error = xfs_droplink(tp, target_ip);
3351 384 : if (error)
3352 0 : goto out_trans_cancel;
3353 : }
3354 : } /* target_ip != NULL */
3355 :
3356 : /*
3357 : * Remove the source.
3358 : */
3359 21390871 : if (new_parent && src_is_directory) {
3360 : /*
3361 : * Rewrite the ".." entry to point to the new
3362 : * directory.
3363 : */
3364 7863267 : error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3365 : target_dp->i_ino, spaceres);
3366 7863267 : ASSERT(error != -EEXIST);
3367 7863267 : if (error)
3368 0 : goto out_trans_cancel;
3369 : }
3370 :
3371 : /*
3372 : * We always want to hit the ctime on the source inode.
3373 : *
3374 : * This isn't strictly required by the standards since the source
3375 : * inode isn't really being changed, but old unix file systems did
3376 : * it and some incremental backup programs won't work without it.
3377 : */
3378 21390871 : xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3379 21390870 : xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3380 :
3381 : /*
3382 : * Adjust the link count on src_dp. This is necessary when
3383 : * renaming a directory, either within one parent when
3384 : * the target existed, or across two parent directories.
3385 : */
3386 21390873 : if (src_is_directory && (new_parent || target_ip != NULL)) {
3387 :
3388 : /*
3389 : * Decrement link count on src_directory since the
3390 : * entry that's moved no longer points to it.
3391 : */
3392 7863643 : error = xfs_droplink(tp, src_dp);
3393 7863643 : if (error)
3394 0 : goto out_trans_cancel;
3395 : }
3396 :
3397 : /*
3398 : * For whiteouts, we only need to update the source dirent with the
3399 : * inode number of the whiteout inode rather than removing it
3400 : * altogether.
3401 : */
3402 21390873 : if (wip)
3403 1787345 : error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3404 : spaceres);
3405 : else
3406 19603528 : error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3407 : spaceres);
3408 :
3409 21390864 : if (error)
3410 2 : goto out_trans_cancel;
3411 :
3412 21390862 : if (wip_pptr) {
3413 1786120 : error = xfs_parent_add(tp, wip_pptr, src_dp, src_name, wip);
3414 1786115 : if (error)
3415 0 : goto out_trans_cancel;
3416 : }
3417 :
3418 21390857 : if (src_ip_pptr) {
3419 21243036 : error = xfs_parent_replace(tp, src_ip_pptr, src_dp, src_name,
3420 : target_dp, target_name, src_ip);
3421 21243034 : if (error)
3422 0 : goto out_trans_cancel;
3423 : }
3424 :
3425 21390855 : if (tgt_ip_pptr) {
3426 255980 : error = xfs_parent_remove(tp, tgt_ip_pptr, target_dp,
3427 : target_name, target_ip);
3428 255985 : if (error)
3429 0 : goto out_trans_cancel;
3430 : }
3431 :
3432 21390860 : xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3433 21390859 : xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3434 21390869 : if (new_parent)
3435 20455415 : xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3436 :
3437 : /*
3438 :	 * Inform our hook clients that we've finished a rename operation as
3439 :	 * follows: that we removed the source and target files from their
3440 :	 * directories; that we added the source to the target directory; and
3441 :	 * finally that we added the whiteout, if there was one. All inodes are
3442 : * locked, so it's ok to model a rename this way so long as we say we
3443 : * deleted entries before we add new ones.
3444 : */
3445 21390865 : if (target_ip)
3446 255985 : xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
3447 21390865 : xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
3448 21390864 : xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
3449 21390866 : if (wip)
3450 1787345 : xfs_dir_update_hook(src_dp, wip, 1, src_name);
3451 :
3452 21390864 : error = xfs_finish_rename(tp);
3453 21390874 : nospace_error = 0;
3454 21390874 : goto out_unlock;
3455 :
3456 1088 : out_trans_cancel:
3457 1088 : xfs_trans_cancel(tp);
3458 29480238 : out_unlock:
3459 29480238 : xfs_iunlock_rename(inodes, num_inodes);
3460 29658609 : out_tgt_ip_pptr:
3461 29658609 : xfs_parent_finish(mp, tgt_ip_pptr);
3462 29658607 : out_wip_pptr:
3463 29658607 : xfs_parent_finish(mp, wip_pptr);
3464 29658607 : out_src_ip_pptr:
3465 29658607 : xfs_parent_finish(mp, src_ip_pptr);
3466 29658604 : out_release_wip:
3467 29658604 : if (wip)
3468 1814559 : xfs_irele(wip);
3469 29658607 : if (error == -ENOSPC && nospace_error)
3470 177270 : error = nospace_error;
3471 : return error;
3472 : }
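
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * ENOSPC fallback that xfs_rename() above uses when allocating its
 * transaction. The xfs_trans_alloc() calls and the tr_rename reservation
 * mirror the real ones; the helper itself is hypothetical.
 */
static inline int
xfs_rename_alloc_trans_sketch(
	struct xfs_mount	*mp,
	unsigned int		spaceres,
	int			*nospace_error,
	struct xfs_trans	**tpp)
{
	int			error;

	/* First try with a full block reservation for the rename. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, tpp);
	if (error != -ENOSPC)
		return error;

	/*
	 * Fall back to a reservationless transaction, remembering the
	 * -ENOSPC so it can be reported if the rename still cannot fit.
	 */
	*nospace_error = error;
	return xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, tpp);
}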
3473 :
3474 : static int
3475 185435466 : xfs_iflush(
3476 : struct xfs_inode *ip,
3477 : struct xfs_buf *bp)
3478 : {
3479 185435466 : struct xfs_inode_log_item *iip = ip->i_itemp;
3480 185435466 : struct xfs_dinode *dip;
3481 185435466 : struct xfs_mount *mp = ip->i_mount;
3482 185435466 : int error;
3483 :
3484 185435466 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3485 370870932 : ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3486 185435466 : ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3487 : ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3488 185435466 : ASSERT(iip->ili_item.li_buf == bp);
3489 :
3490 185435466 : dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3491 :
3492 : /*
3493 : * We don't flush the inode if any of the following checks fail, but we
3494 : * do still update the log item and attach to the backing buffer as if
3495 : * the flush happened. This is a formality to facilitate predictable
3496 :	 * error handling as the caller will shut down and fail the buffer.
3497 : */
3498 185435466 : error = -EFSCORRUPTED;
3499 185435466 : if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3500 : mp, XFS_ERRTAG_IFLUSH_1)) {
3501 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3502 : "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
3503 : __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3504 0 : goto flush_out;
3505 : }
3506 185435466 : if (S_ISREG(VFS_I(ip)->i_mode)) {
3507 110997074 : if (XFS_TEST_ERROR(
3508 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3509 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3510 : mp, XFS_ERRTAG_IFLUSH_3)) {
3511 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3512 : "%s: Bad regular inode %llu, ptr "PTR_FMT,
3513 : __func__, ip->i_ino, ip);
3514 0 : goto flush_out;
3515 : }
3516 74438392 : } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3517 50700378 : if (XFS_TEST_ERROR(
3518 : ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3519 : ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3520 : ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3521 : mp, XFS_ERRTAG_IFLUSH_4)) {
3522 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3523 : "%s: Bad directory inode %llu, ptr "PTR_FMT,
3524 : __func__, ip->i_ino, ip);
3525 0 : goto flush_out;
3526 : }
3527 : }
3528 370870932 : if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
3529 : ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3530 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3531 : "%s: detected corrupt incore inode %llu, "
3532 : "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
3533 : __func__, ip->i_ino,
3534 : ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
3535 : ip->i_nblocks, ip);
3536 0 : goto flush_out;
3537 : }
3538 185435466 : if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3539 : mp, XFS_ERRTAG_IFLUSH_6)) {
3540 0 : xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3541 : "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
3542 : __func__, ip->i_ino, ip->i_forkoff, ip);
3543 0 : goto flush_out;
3544 : }
3545 :
3546 : /*
3547 :	 * Inode item log recovery for v2 inodes is dependent on the flushiter
3548 :	 * count for correct sequencing. We bump the flush iteration count so
3549 :	 * we can detect flushes which postdate a log record during recovery.
3550 :	 * This is redundant as we now log every change and hence this can't
3551 :	 * happen, but we still need to do it to ensure backwards compatibility
3552 : * with old kernels that predate logging all inode changes.
3553 : */
3554 185435466 : if (!xfs_has_v3inodes(mp))
3555 242 : ip->i_flushiter++;
3556 :
3557 : /*
3558 : * If there are inline format data / attr forks attached to this inode,
3559 : * make sure they are not corrupt.
3560 : */
3561 235509379 : if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3562 50073913 : xfs_ifork_verify_local_data(ip))
3563 0 : goto flush_out;
3564 185435466 : if (xfs_inode_has_attr_fork(ip) &&
3565 321587445 : ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
3566 138748298 : xfs_ifork_verify_local_attr(ip))
3567 0 : goto flush_out;
3568 :
3569 : /*
3570 : * Copy the dirty parts of the inode into the on-disk inode. We always
3571 : * copy out the core of the inode, because if the inode is dirty at all
3572 : * the core must be.
3573 : */
3574 185435466 : xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3575 :
3576 : /* Wrap, we never let the log put out DI_MAX_FLUSH */
3577 185435466 : if (!xfs_has_v3inodes(mp)) {
3578 242 : if (ip->i_flushiter == DI_MAX_FLUSH)
3579 0 : ip->i_flushiter = 0;
3580 : }
3581 :
3582 185435466 : xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3583 185435466 : if (xfs_inode_has_attr_fork(ip))
3584 182839147 : xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3585 :
3586 : /*
3587 : * We've recorded everything logged in the inode, so we'd like to clear
3588 : * the ili_fields bits so we don't log and flush things unnecessarily.
3589 : * However, we can't stop logging all this information until the data
3590 : * we've copied into the disk buffer is written to disk. If we did we
3591 : * might overwrite the copy of the inode in the log with all the data
3592 : * after re-logging only part of it, and in the face of a crash we
3593 : * wouldn't have all the data we need to recover.
3594 : *
3595 : * What we do is move the bits to the ili_last_fields field. When
3596 : * logging the inode, these bits are moved back to the ili_fields field.
3597 : * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3598 : * we know that the information those bits represent is permanently on
3599 : * disk. As long as the flush completes before the inode is logged
3600 : * again, then both ili_fields and ili_last_fields will be cleared.
3601 : */
3602 : error = 0;
3603 185435466 : flush_out:
3604 185435466 : spin_lock(&iip->ili_lock);
3605 185435466 : iip->ili_last_fields = iip->ili_fields;
3606 185435466 : iip->ili_fields = 0;
3607 185435466 : iip->ili_fsync_fields = 0;
3608 185435466 : spin_unlock(&iip->ili_lock);
3609 :
3610 : /*
3611 : * Store the current LSN of the inode so that we can tell whether the
3612 : * item has moved in the AIL from xfs_buf_inode_iodone().
3613 : */
3614 185435466 : xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3615 : &iip->ili_item.li_lsn);
3616 :
3617 : /* generate the checksum. */
3618 185435466 : xfs_dinode_calc_crc(mp, dip);
3619 185435466 : if (error)
3620 0 : xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
3621 185435466 : return error;
3622 : }
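
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * ili_fields handoff performed at flush_out above. Everything recorded
 * in ili_last_fields keeps being logged until xfs_buf_inode_iodone()
 * clears it once the buffer write completes. The field and lock names
 * are real; this helper is hypothetical.
 */
static inline void
xfs_iflush_switch_fields_sketch(
	struct xfs_inode_log_item	*iip)
{
	spin_lock(&iip->ili_lock);
	iip->ili_last_fields = iip->ili_fields;	/* remember what was flushed */
	iip->ili_fields = 0;			/* nothing re-dirtied yet */
	iip->ili_fsync_fields = 0;
	spin_unlock(&iip->ili_lock);
}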
3623 :
3624 : /*
3625 : * Non-blocking flush of dirty inode metadata into the backing buffer.
3626 : *
3627 : * The caller must have a reference to the inode and hold the cluster buffer
3628 : * locked. The function will walk across all the inodes on the cluster buffer it
3629 : * locked. The function will walk across all the inodes attached to the buffer
3630 : * that it can lock without blocking, and flush them to the cluster buffer.
3631 : * On successful flushing of at least one inode, the caller must write out the
3632 : * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3633 : * the caller needs to release the buffer. On failure, the filesystem will be
3634 : * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3635 : * will be returned.
3636 : */
3637 : int
3638 31230729 : xfs_iflush_cluster(
3639 : struct xfs_buf *bp)
3640 : {
3641 31230729 : struct xfs_mount *mp = bp->b_mount;
3642 31230729 : struct xfs_log_item *lip, *n;
3643 31230729 : struct xfs_inode *ip;
3644 31230729 : struct xfs_inode_log_item *iip;
3645 31230729 : int clcount = 0;
3646 31230729 : int error = 0;
3647 :
3648 : /*
3649 : * We must use the safe variant here as on shutdown xfs_iflush_abort()
3650 :	 * will remove the log item from the list.
3651 : */
3652 223994089 : list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3653 192763360 : iip = (struct xfs_inode_log_item *)lip;
3654 192763360 : ip = iip->ili_inode;
3655 :
3656 : /*
3657 : * Quick and dirty check to avoid locks if possible.
3658 : */
3659 192763360 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3660 109225 : continue;
3661 192654135 : if (xfs_ipincount(ip))
3662 5252442 : continue;
3663 :
3664 : /*
3665 : * The inode is still attached to the buffer, which means it is
3666 : * dirty but reclaim might try to grab it. Check carefully for
3667 : * that, and grab the ilock while still holding the i_flags_lock
3668 : * to guarantee reclaim will not be able to reclaim this inode
3669 : * once we drop the i_flags_lock.
3670 : */
3671 187401693 : spin_lock(&ip->i_flags_lock);
3672 187401693 : ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3673 187401693 : if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3674 3 : spin_unlock(&ip->i_flags_lock);
3675 3 : continue;
3676 : }
3677 :
3678 : /*
3679 : * ILOCK will pin the inode against reclaim and prevent
3680 : * concurrent transactions modifying the inode while we are
3681 : * flushing the inode. If we get the lock, set the flushing
3682 : * state before we drop the i_flags_lock.
3683 : */
3684 187401690 : if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3685 965542 : spin_unlock(&ip->i_flags_lock);
3686 965542 : continue;
3687 : }
3688 186436148 : __xfs_iflags_set(ip, XFS_IFLUSHING);
3689 186436148 : spin_unlock(&ip->i_flags_lock);
3690 :
3691 : /*
3692 : * Abort flushing this inode if we are shut down because the
3693 : * inode may not currently be in the AIL. This can occur when
3694 : * log I/O failure unpins the inode without inserting into the
3695 : * AIL, leaving a dirty/unpinned inode attached to the buffer
3696 : * that otherwise looks like it should be flushed.
3697 : */
3698 372872296 : if (xlog_is_shutdown(mp->m_log)) {
3699 1000682 : xfs_iunpin_wait(ip);
3700 1000682 : xfs_iflush_abort(ip);
3701 1000682 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3702 1000682 : error = -EIO;
3703 1000682 : continue;
3704 : }
3705 :
3706 : /* don't block waiting on a log force to unpin dirty inodes */
3707 185435466 : if (xfs_ipincount(ip)) {
3708 0 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3709 0 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3710 0 : continue;
3711 : }
3712 :
3713 185435466 : if (!xfs_inode_clean(ip))
3714 185435466 : error = xfs_iflush(ip, bp);
3715 : else
3716 0 : xfs_iflags_clear(ip, XFS_IFLUSHING);
3717 185435466 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3718 185435466 : if (error)
3719 : break;
3720 185435466 : clcount++;
3721 : }
3722 :
3723 31230729 : if (error) {
3724 : /*
3725 : * Shutdown first so we kill the log before we release this
3726 : * buffer. If it is an INODE_ALLOC buffer and pins the tail
3727 : * of the log, failing it before the _log_ is shut down can
3728 : * result in the log tail being moved forward in the journal
3729 : * on disk because log writes can still be taking place. Hence
3730 : * unpinning the tail will allow the ICREATE intent to be
3731 :		 * removed from the log and recovery will fail with uninitialised
3732 : * inode cluster buffers.
3733 : */
3734 275467 : xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3735 275467 : bp->b_flags |= XBF_ASYNC;
3736 275467 : xfs_buf_ioend_fail(bp);
3737 275467 : return error;
3738 : }
3739 :
3740 30955262 : if (!clcount)
3741 : return -EAGAIN;
3742 :
3743 30879392 : XFS_STATS_INC(mp, xs_icluster_flushcnt);
3744 30879392 : XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3745 30879392 : return 0;
3746 :
3747 : }
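
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * non-blocking grab performed for each inode in xfs_iflush_cluster()
 * above: cheap unlocked checks first, then a recheck and trylock under
 * i_flags_lock so reclaim cannot win the race. Hypothetical helper.
 */
static inline bool
xfs_iflush_try_grab_sketch(
	struct xfs_inode	*ip)
{
	/* Quick and dirty checks to avoid taking locks if possible. */
	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING) ||
	    xfs_ipincount(ip))
		return false;

	spin_lock(&ip->i_flags_lock);
	if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING) ||
	    !xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}
	__xfs_iflags_set(ip, XFS_IFLUSHING);
	spin_unlock(&ip->i_flags_lock);
	return true;
}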
3748 :
3749 : /* Release an inode. */
3750 : void
3751 65952501415 : xfs_irele(
3752 : struct xfs_inode *ip)
3753 : {
3754 65952501415 : trace_xfs_irele(ip, _RET_IP_);
3755 65993901475 : iput(VFS_I(ip));
3756 65881548738 : }
3757 :
3758 : /*
3759 : * Ensure all commited transactions touching the inode are written to the log.
3760 : * Ensure all committed transactions touching the inode are written to the log.
3761 : int
3762 518970 : xfs_log_force_inode(
3763 : struct xfs_inode *ip)
3764 : {
3765 518970 : xfs_csn_t seq = 0;
3766 :
3767 518970 : xfs_ilock(ip, XFS_ILOCK_SHARED);
3768 518972 : if (xfs_ipincount(ip))
3769 29450 : seq = ip->i_itemp->ili_commit_seq;
3770 518972 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
3771 :
3772 518972 : if (!seq)
3773 : return 0;
3774 29450 : return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3775 : }
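
/*
 * Editor's note: an illustrative sketch, not part of this file, of a
 * hypothetical caller forcing two inodes' committed transactions to the
 * log, e.g. before publishing a cross-file operation.
 */
static inline int
xfs_log_force_two_inodes_sketch(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	error = xfs_log_force_inode(ip1);
	if (error || ip1 == ip2)
		return error;
	return xfs_log_force_inode(ip2);
}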
3776 :
3777 : /*
3778 : * Grab the exclusive iolock for a data copy from src to dest, making sure to
3779 : * abide vfs locking order (lowest pointer value goes first) and breaking the
3780 : * abide by the vfs locking order (lowest pointer value goes first) and to break
3781 : * the layout leases before proceeding. The loop is needed because we cannot call
3782 : * back out both locks.
3783 : */
3784 : static int
3785 137423514 : xfs_iolock_two_inodes_and_break_layout(
3786 : struct inode *src,
3787 : struct inode *dest)
3788 : {
3789 137423514 : int error;
3790 :
3791 137423514 : if (src > dest)
3792 68133895 : swap(src, dest);
3793 :
3794 137423514 : retry:
3795 : /* Wait to break both inodes' layouts before we start locking. */
3796 137423514 : error = break_layout(src, true);
3797 137422644 : if (error)
3798 0 : return error;
3799 137422644 : if (src != dest) {
3800 135729499 : error = break_layout(dest, true);
3801 135729274 : if (error)
3802 0 : return error;
3803 : }
3804 :
3805 : /* Lock one inode and make sure nobody got in and leased it. */
3806 137422419 : inode_lock(src);
3807 137423224 : error = break_layout(src, false);
3808 137423817 : if (error) {
3809 0 : inode_unlock(src);
3810 0 : if (error == -EWOULDBLOCK)
3811 0 : goto retry;
3812 0 : return error;
3813 : }
3814 :
3815 137423817 : if (src == dest)
3816 : return 0;
3817 :
3818 : /* Lock the other inode and make sure nobody got in and leased it. */
3819 135730671 : inode_lock_nested(dest, I_MUTEX_NONDIR2);
3820 135730992 : error = break_layout(dest, false);
3821 135731053 : if (error) {
3822 0 : inode_unlock(src);
3823 0 : inode_unlock(dest);
3824 0 : if (error == -EWOULDBLOCK)
3825 0 : goto retry;
3826 0 : return error;
3827 : }
3828 :
3829 : return 0;
3830 : }
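
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * address-ordered double-lock idiom used above. Locking the lower-
 * addressed inode first imposes a global order, so two tasks locking
 * the same pair cannot deadlock. Hypothetical helper; no layout
 * breaking here.
 */
static inline void
xfs_double_inode_lock_sketch(
	struct inode	*a,
	struct inode	*b)
{
	if (a > b)
		swap(a, b);
	inode_lock(a);
	if (b != a)
		inode_lock_nested(b, I_MUTEX_NONDIR2);
}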
3831 :
3832 : static int
3833 : xfs_mmaplock_two_inodes_and_break_dax_layout(
3834 : struct xfs_inode *ip1,
3835 : struct xfs_inode *ip2)
3836 : {
3837 : int error;
3838 : bool retry;
3839 : struct page *page;
3840 :
3841 : if (ip1->i_ino > ip2->i_ino)
3842 : swap(ip1, ip2);
3843 :
3844 : again:
3845 : retry = false;
3846 : /* Lock the first inode */
3847 : xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3848 : error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
3849 : if (error || retry) {
3850 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3851 : if (error == 0 && retry)
3852 : goto again;
3853 : return error;
3854 : }
3855 :
3856 : if (ip1 == ip2)
3857 : return 0;
3858 :
3859 : /* Nested lock the second inode */
3860 : xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
3861 : /*
3862 : * We cannot use xfs_break_dax_layouts() directly here because it may
3863 : * need to unlock and relock the XFS_MMAPLOCK_EXCL, which is not
3864 : * suitable for this nested lock case.
3865 : */
3866 : page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
3867 : if (page && page_ref_count(page) != 1) {
3868 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3869 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3870 : goto again;
3871 : }
3872 :
3873 : return 0;
3874 : }
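
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * DAX busy-page test used above. A page returned by
 * dax_layout_busy_page() whose refcount is not 1 still has DMA or
 * get_user_pages() references, so the caller must back out its locks
 * and retry. Hypothetical helper.
 */
static inline bool
xfs_dax_mapping_is_busy_sketch(
	struct address_space	*mapping)
{
	struct page		*page = dax_layout_busy_page(mapping);

	return page && page_ref_count(page) != 1;
}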
3875 :
3876 : /*
3877 : * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3878 : * mmap activity.
3879 : */
3880 : int
3881 137418770 : xfs_ilock2_io_mmap(
3882 : struct xfs_inode *ip1,
3883 : struct xfs_inode *ip2)
3884 : {
3885 137418770 : int ret;
3886 :
3887 137418770 : ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3888 137424490 : if (ret)
3889 : return ret;
3890 :
3891 137424568 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3892 : ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
3893 : if (ret) {
3894 : inode_unlock(VFS_I(ip2));
3895 : if (ip1 != ip2)
3896 : inode_unlock(VFS_I(ip1));
3897 : return ret;
3898 : }
3899 : } else
3900 137424568 : filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
3901 : VFS_I(ip2)->i_mapping);
3902 :
3903 137424568 : return 0;
3904 : }
3905 :
3906 : /* Unlock both inodes to allow IO and mmap activity. */
3907 : void
3908 137418374 : xfs_iunlock2_io_mmap(
3909 : struct xfs_inode *ip1,
3910 : struct xfs_inode *ip2)
3911 : {
3912 137418374 : if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3913 : xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3914 : if (ip1 != ip2)
3915 : xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3916 : } else
3917 137418374 : filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
3918 : VFS_I(ip2)->i_mapping);
3919 :
3920 137418488 : inode_unlock(VFS_I(ip2));
3921 137415981 : if (ip1 != ip2)
3922 135722835 : inode_unlock(VFS_I(ip1));
3923 137419641 : }
3924 :
3925 : /* Compute the number of data and realtime blocks used by a file. */
3926 : void
3927 51257323 : xfs_inode_count_blocks(
3928 : struct xfs_trans *tp,
3929 : struct xfs_inode *ip,
3930 : xfs_filblks_t *dblocks,
3931 : xfs_filblks_t *rblocks)
3932 : {
3933 51257323 : struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
3934 :
3935 51257323 : if (!XFS_IS_REALTIME_INODE(ip)) {
3936 51257323 : *dblocks = ip->i_nblocks;
3937 51257323 : *rblocks = 0;
3938 51257323 : return;
3939 : }
3940 :
3941 0 : *rblocks = 0;
3942 0 : xfs_bmap_count_leaves(ifp, rblocks);
3943 0 : *dblocks = ip->i_nblocks - *rblocks;
3944 : }
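
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * invariant maintained by xfs_inode_count_blocks() above: data and
 * realtime blocks always add back up to i_nblocks. Passing a NULL
 * transaction is assumed acceptable here since the path shown never
 * dereferences it. Hypothetical helper.
 */
static inline void
xfs_check_block_counts_sketch(
	struct xfs_inode	*ip)
{
	xfs_filblks_t		dblocks, rblocks;

	xfs_inode_count_blocks(NULL, ip, &dblocks, &rblocks);
	ASSERT(dblocks + rblocks == ip->i_nblocks);
}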
3945 :
3946 : static void
3947 : xfs_wait_dax_page(
3948 : struct inode *inode)
3949 : {
3950 : struct xfs_inode *ip = XFS_I(inode);
3951 :
3952 : xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
3953 : schedule();
3954 : xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
3955 : }
3956 :
3957 : int
3958 17987844 : xfs_break_dax_layouts(
3959 : struct inode *inode,
3960 : bool *retry)
3961 : {
3962 17987844 : struct page *page;
3963 :
3964 17987844 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
3965 :
3966 17987844 : page = dax_layout_busy_page(inode->i_mapping);
3967 17987844 : if (!page)
3968 17987844 : return 0;
3969 :
3970 : *retry = true;
3971 : return ___wait_var_event(&page->_refcount,
3972 : atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
3973 : 0, 0, xfs_wait_dax_page(inode));
3974 : }
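
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * retry loop a caller of xfs_break_dax_layouts() needs: the function
 * drops and retakes XFS_MMAPLOCK_EXCL while sleeping, so "retry" asks
 * the caller to re-check from scratch. The caller is assumed to hold
 * XFS_MMAPLOCK_EXCL. Hypothetical helper.
 */
static inline int
xfs_break_dax_layouts_loop_sketch(
	struct inode	*inode)
{
	bool		retry;
	int		error;

	do {
		retry = false;
		error = xfs_break_dax_layouts(inode, &retry);
	} while (error == 0 && retry);

	return error;
}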
3975 :
3976 : int
3977 95782141 : xfs_break_layouts(
3978 : struct inode *inode,
3979 : uint *iolock,
3980 : enum layout_break_reason reason)
3981 : {
3982 95782141 : bool retry;
3983 95782141 : int error;
3984 :
3985 95782141 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
3986 :
3987 95783168 : do {
3988 95783168 : retry = false;
3989 95783168 : switch (reason) {
3990 17988070 : case BREAK_UNMAP:
3991 17988070 : error = xfs_break_dax_layouts(inode, &retry);
3992 17987976 : if (error || retry)
3993 : break;
3994 95782895 : fallthrough;
3995 : case BREAK_WRITE:
3996 95782895 : error = xfs_break_leased_layouts(inode, iolock, &retry);
3997 95782895 : break;
3998 : default:
3999 0 : WARN_ON_ONCE(1);
4000 0 : error = -EINVAL;
4001 : }
4002 95842758 : } while (error == 0 && retry);
4003 :
4004 95841731 : return error;
4005 : }
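
/*
 * Editor's note: an illustrative sketch, not part of this file, of a
 * typical BREAK_UNMAP caller. Both the IOLOCK and the MMAPLOCK are
 * taken exclusively up front; xfs_break_layouts() may cycle the IOLOCK
 * while recalling leases, which is why *iolock is passed by pointer.
 * Hypothetical helper.
 */
static inline int
xfs_break_layouts_for_unmap_sketch(
	struct xfs_inode	*ip,
	uint			*iolock)
{
	*iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	xfs_ilock(ip, *iolock);
	return xfs_break_layouts(VFS_I(ip), iolock, BREAK_UNMAP);
}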
4006 :
4007 : /* Returns the size of fundamental allocation unit for a file, in bytes. */
4008 : /* Returns the size of the fundamental allocation unit for a file, in bytes. */
4009 1722363 : xfs_inode_alloc_unitsize(
4010 : struct xfs_inode *ip)
4011 : {
4012 1722363 : unsigned int blocks = 1;
4013 :
4014 1722363 : if (XFS_IS_REALTIME_INODE(ip))
4015 479013 : blocks = ip->i_mount->m_sb.sb_rextsize;
4016 :
4017 1722363 : return XFS_FSB_TO_B(ip->i_mount, blocks);
4018 : }
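
/*
 * Editor's note: an illustrative sketch, not part of this file, of
 * using xfs_inode_alloc_unitsize() to round a byte offset down to the
 * file's allocation unit, e.g. when aligning a hole-punch request.
 * Assumes a non-negative offset; hypothetical helper.
 */
static inline xfs_off_t
xfs_align_to_alloc_unit_sketch(
	struct xfs_inode	*ip,
	xfs_off_t		offset)
{
	/* rounddown() works for any granularity, not just powers of two. */
	return rounddown(offset, xfs_inode_alloc_unitsize(ip));
}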
|