Line data Source code
1 : // SPDX-License-Identifier: LGPL-2.1
2 : /*
3 : * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
4 : * Written by Takashi Sato <t-sato@yk.jp.nec.com>
5 : * Akira Fujita <a-fujita@rs.jp.nec.com>
6 : */
7 :
8 : #include <linux/fs.h>
9 : #include <linux/quotaops.h>
10 : #include <linux/slab.h>
11 : #include <linux/sched/mm.h>
12 : #include "ext4_jbd2.h"
13 : #include "ext4.h"
14 : #include "ext4_extents.h"
15 :
16 : /**
17 : * get_ext_path() - Find an extent path for designated logical block number.
18 : * @inode: inode to be searched
19 : * @lblock: logical block number to find an extent path
20 : * @ppath: pointer to an extent path pointer (for output)
21 : *
22 : * ext4_find_extent wrapper. Return 0 on success, or a negative error value
23 : * on failure.
24 : */
25 : static inline int
26 5453740 : get_ext_path(struct inode *inode, ext4_lblk_t lblock,
27 : struct ext4_ext_path **ppath)
28 : {
29 5453740 : struct ext4_ext_path *path;
30 :
31 5453740 : path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
32 5453740 : if (IS_ERR(path))
33 0 : return PTR_ERR(path);
34 5453740 : if (path[ext_depth(inode)].p_ext == NULL) {
35 30 : ext4_free_ext_path(path);
36 30 : *ppath = NULL;
37 30 : return -ENODATA;
38 : }
39 5453710 : *ppath = path;
40 5453710 : return 0;
41 : }
42 :
43 : /**
44 : * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem
45 : * @first: inode to be locked
46 : * @second: inode to be locked
47 : *
48 : * Acquire write lock of i_data_sem of the two inodes
49 : */
50 : void
51 3933620 : ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
52 : {
53 3933620 : if (first < second) {
54 3220565 : down_write(&EXT4_I(first)->i_data_sem);
55 3220565 : down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
56 : } else {
57 713055 : down_write(&EXT4_I(second)->i_data_sem);
58 713055 : down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
59 :
60 : }
61 3933620 : }
62 :
63 : /**
64 : * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
65 : *
66 : * @orig_inode: original inode structure to be released its lock first
67 : * @donor_inode: donor inode structure to be released its lock second
68 : * Release write lock of i_data_sem of two inodes (orig and donor).
69 : */
70 : void
71 0 : ext4_double_up_write_data_sem(struct inode *orig_inode,
72 : struct inode *donor_inode)
73 : {
74 0 : up_write(&EXT4_I(orig_inode)->i_data_sem);
75 3933620 : up_write(&EXT4_I(donor_inode)->i_data_sem);
76 0 : }
77 :
78 : /**
79 : * mext_check_coverage - Check that all extents in range has the same type
80 : *
81 : * @inode: inode in question
82 : * @from: block offset of inode
83 : * @count: block count to be checked
84 : * @unwritten: extents expected to be unwritten
85 : * @err: pointer to save error value
86 : *
87 : * Return 1 if all extents in range has expected type, and zero otherwise.
88 : */
89 : static int
90 3590264 : mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
91 : int unwritten, int *err)
92 : {
93 3590264 : struct ext4_ext_path *path = NULL;
94 3590264 : struct ext4_extent *ext;
95 3590264 : int ret = 0;
96 3590264 : ext4_lblk_t last = from + count;
97 7159647 : while (from < last) {
98 3590264 : *err = get_ext_path(inode, from, &path);
99 3590264 : if (*err)
100 0 : goto out;
101 3590264 : ext = path[ext_depth(inode)].p_ext;
102 3590264 : if (unwritten != ext4_ext_is_unwritten(ext))
103 20881 : goto out;
104 7138766 : from += ext4_ext_get_actual_len(ext);
105 : }
106 : ret = 1;
107 3590264 : out:
108 3590264 : ext4_free_ext_path(path);
109 3590264 : return ret;
110 : }
111 :
112 : /**
113 : * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2
114 : *
115 : * @inode1: the inode structure
116 : * @inode2: the inode structure
117 : * @index1: folio index
118 : * @index2: folio index
119 : * @folio: result folio vector
120 : *
121 : * Grab two locked folio for inode's by inode order
122 : */
123 : static int
124 1865890 : mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
125 : pgoff_t index1, pgoff_t index2, struct folio *folio[2])
126 : {
127 1865890 : struct address_space *mapping[2];
128 1865890 : unsigned int flags;
129 :
130 1865890 : BUG_ON(!inode1 || !inode2);
131 1865890 : if (inode1 < inode2) {
132 1523206 : mapping[0] = inode1->i_mapping;
133 1523206 : mapping[1] = inode2->i_mapping;
134 : } else {
135 342684 : swap(index1, index2);
136 342684 : mapping[0] = inode2->i_mapping;
137 342684 : mapping[1] = inode1->i_mapping;
138 : }
139 :
140 1865890 : flags = memalloc_nofs_save();
141 1865890 : folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
142 : mapping_gfp_mask(mapping[0]));
143 1865890 : if (IS_ERR(folio[0])) {
144 0 : memalloc_nofs_restore(flags);
145 0 : return PTR_ERR(folio[0]);
146 : }
147 :
148 1865890 : folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
149 : mapping_gfp_mask(mapping[1]));
150 1865890 : memalloc_nofs_restore(flags);
151 1865890 : if (IS_ERR(folio[1])) {
152 0 : folio_unlock(folio[0]);
153 0 : folio_put(folio[0]);
154 0 : return PTR_ERR(folio[1]);
155 : }
156 : /*
157 : * __filemap_get_folio() may not wait on folio's writeback if
158 : * BDI not demand that. But it is reasonable to be very conservative
159 : * here and explicitly wait on folio's writeback
160 : */
161 1865890 : folio_wait_writeback(folio[0]);
162 1865890 : folio_wait_writeback(folio[1]);
163 1865890 : if (inode1 > inode2)
164 342684 : swap(folio[0], folio[1]);
165 :
166 : return 0;
167 : }
168 :
169 : /* Force page buffers uptodate w/o dropping page's lock */
170 : static int
171 91639 : mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
172 : {
173 91639 : struct inode *inode = folio->mapping->host;
174 91639 : sector_t block;
175 91639 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
176 91639 : unsigned int blocksize, block_start, block_end;
177 91639 : int i, err, nr = 0, partial = 0;
178 91639 : BUG_ON(!folio_test_locked(folio));
179 91639 : BUG_ON(folio_test_writeback(folio));
180 :
181 133119 : if (folio_test_uptodate(folio))
182 : return 0;
183 :
184 50159 : blocksize = i_blocksize(inode);
185 50159 : head = folio_buffers(folio);
186 50159 : if (!head) {
187 50159 : create_empty_buffers(&folio->page, blocksize, 0);
188 50159 : head = folio_buffers(folio);
189 : }
190 :
191 50159 : block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
192 100318 : for (bh = head, block_start = 0; bh != head || !block_start;
193 50159 : block++, block_start = block_end, bh = bh->b_this_page) {
194 50159 : block_end = block_start + blocksize;
195 50159 : if (block_end <= from || block_start >= to) {
196 0 : if (!buffer_uptodate(bh))
197 0 : partial = 1;
198 0 : continue;
199 : }
200 100318 : if (buffer_uptodate(bh))
201 0 : continue;
202 100318 : if (!buffer_mapped(bh)) {
203 50159 : err = ext4_get_block(inode, block, bh, 0);
204 50159 : if (err) {
205 0 : folio_set_error(folio);
206 0 : return err;
207 : }
208 100318 : if (!buffer_mapped(bh)) {
209 19248 : folio_zero_range(folio, block_start, blocksize);
210 19248 : set_buffer_uptodate(bh);
211 19248 : continue;
212 : }
213 : }
214 30911 : BUG_ON(nr >= MAX_BUF_PER_PAGE);
215 30911 : arr[nr++] = bh;
216 : }
217 : /* No io required */
218 50159 : if (!nr)
219 19248 : goto out;
220 :
221 61822 : for (i = 0; i < nr; i++) {
222 30911 : bh = arr[i];
223 30911 : if (!bh_uptodate_or_lock(bh)) {
224 30911 : err = ext4_read_bh(bh, 0, NULL);
225 30911 : if (err)
226 0 : return err;
227 : }
228 : }
229 30911 : out:
230 50159 : if (!partial)
231 50159 : folio_mark_uptodate(folio);
232 : return 0;
233 : }
234 :
235 : /**
236 : * move_extent_per_page - Move extent data per page
237 : *
238 : * @o_filp: file structure of original file
239 : * @donor_inode: donor inode
240 : * @orig_page_offset: page index on original file
241 : * @donor_page_offset: page index on donor file
242 : * @data_offset_in_page: block index where data swapping starts
243 : * @block_len_in_page: the number of blocks to be swapped
244 : * @unwritten: orig extent is unwritten or not
245 : * @err: pointer to save return value
246 : *
247 : * Save the data in original inode blocks and replace original inode extents
248 : * with donor inode extents by calling ext4_swap_extents().
249 : * Finally, write out the saved data in new original inode blocks. Return
250 : * replaced block count.
251 : */
252 : static int
253 1862744 : move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
254 : pgoff_t orig_page_offset, pgoff_t donor_page_offset,
255 : int data_offset_in_page,
256 : int block_len_in_page, int unwritten, int *err)
257 : {
258 1862744 : struct inode *orig_inode = file_inode(o_filp);
259 1862744 : struct folio *folio[2] = {NULL, NULL};
260 1862744 : handle_t *handle;
261 1862744 : ext4_lblk_t orig_blk_offset, donor_blk_offset;
262 1862744 : unsigned long blocksize = orig_inode->i_sb->s_blocksize;
263 1862744 : unsigned int tmp_data_size, data_size, replaced_size;
264 1862744 : int i, err2, jblocks, retries = 0;
265 1862744 : int replaced_count = 0;
266 1862744 : int from = data_offset_in_page << orig_inode->i_blkbits;
267 1862744 : int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
268 1862744 : struct super_block *sb = orig_inode->i_sb;
269 1862744 : struct buffer_head *bh = NULL;
270 :
271 : /*
272 : * It needs twice the amount of ordinary journal buffers because
273 : * inode and donor_inode may change each different metadata blocks.
274 : */
275 : again:
276 1865890 : *err = 0;
277 1865890 : jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
278 1865890 : handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
279 1865890 : if (IS_ERR(handle)) {
280 0 : *err = PTR_ERR(handle);
281 0 : return 0;
282 : }
283 :
284 1865890 : orig_blk_offset = orig_page_offset * blocks_per_page +
285 : data_offset_in_page;
286 :
287 1865890 : donor_blk_offset = donor_page_offset * blocks_per_page +
288 : data_offset_in_page;
289 :
290 : /* Calculate data_size */
291 3731780 : if ((orig_blk_offset + block_len_in_page - 1) ==
292 1865890 : ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
293 : /* Replace the last block */
294 66 : tmp_data_size = orig_inode->i_size & (blocksize - 1);
295 : /*
296 : * If data_size equal zero, it shows data_size is multiples of
297 : * blocksize. So we set appropriate value.
298 : */
299 66 : if (tmp_data_size == 0)
300 62 : tmp_data_size = blocksize;
301 :
302 132 : data_size = tmp_data_size +
303 66 : ((block_len_in_page - 1) << orig_inode->i_blkbits);
304 : } else
305 1865824 : data_size = block_len_in_page << orig_inode->i_blkbits;
306 :
307 1865890 : replaced_size = data_size;
308 :
309 1865890 : *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset,
310 : donor_page_offset, folio);
311 1865890 : if (unlikely(*err < 0))
312 0 : goto stop_journal;
313 : /*
314 : * If orig extent was unwritten it can become initialized
315 : * at any time after i_data_sem was dropped, in order to
316 : * serialize with delalloc we have recheck extent while we
317 : * hold page's lock, if it is still the case data copy is not
318 : * necessary, just swap data blocks between orig and donor.
319 : */
320 :
321 1865890 : VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
322 1865890 : VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
323 1865890 : VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
324 :
325 1865890 : if (unwritten) {
326 1795132 : ext4_double_down_write_data_sem(orig_inode, donor_inode);
327 : /* If any of extents in range became initialized we have to
328 : * fallback to data copying */
329 1795132 : unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
330 : block_len_in_page, 1, err);
331 1795132 : if (*err)
332 0 : goto drop_data_sem;
333 :
334 1795132 : unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
335 : block_len_in_page, 1, err);
336 1795132 : if (*err)
337 0 : goto drop_data_sem;
338 :
339 1795132 : if (!unwritten) {
340 20881 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
341 20881 : goto data_copy;
342 : }
343 1777014 : if ((folio_has_private(folio[0]) &&
344 1774289 : !filemap_release_folio(folio[0], 0)) ||
345 1771526 : (folio_has_private(folio[1]) &&
346 0 : !filemap_release_folio(folio[1], 0))) {
347 2725 : *err = -EBUSY;
348 2725 : goto drop_data_sem;
349 : }
350 1771526 : replaced_count = ext4_swap_extents(handle, orig_inode,
351 : donor_inode, orig_blk_offset,
352 : donor_blk_offset,
353 : block_len_in_page, 1, err);
354 1774251 : drop_data_sem:
355 1774251 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
356 1774251 : goto unlock_folios;
357 : }
358 70758 : data_copy:
359 91639 : *err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
360 91639 : if (*err)
361 0 : goto unlock_folios;
362 :
363 : /* At this point all buffers in range are uptodate, old mapping layout
364 : * is no longer required, try to drop it now. */
365 179739 : if ((folio_has_private(folio[0]) &&
366 178692 : !filemap_release_folio(folio[0], 0)) ||
367 90592 : (folio_has_private(folio[1]) &&
368 0 : !filemap_release_folio(folio[1], 0))) {
369 1047 : *err = -EBUSY;
370 1047 : goto unlock_folios;
371 : }
372 90592 : ext4_double_down_write_data_sem(orig_inode, donor_inode);
373 90592 : replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
374 : orig_blk_offset, donor_blk_offset,
375 : block_len_in_page, 1, err);
376 90592 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
377 90592 : if (*err) {
378 0 : if (replaced_count) {
379 0 : block_len_in_page = replaced_count;
380 0 : replaced_size =
381 0 : block_len_in_page << orig_inode->i_blkbits;
382 : } else
383 0 : goto unlock_folios;
384 : }
385 : /* Perform all necessary steps similar write_begin()/write_end()
386 : * but keeping in mind that i_size will not change */
387 90592 : if (!folio_buffers(folio[0]))
388 90592 : create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
389 90592 : bh = folio_buffers(folio[0]);
390 90592 : for (i = 0; i < data_offset_in_page; i++)
391 0 : bh = bh->b_this_page;
392 181184 : for (i = 0; i < block_len_in_page; i++) {
393 90592 : *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
394 90592 : if (*err < 0)
395 : break;
396 90592 : bh = bh->b_this_page;
397 : }
398 90592 : if (!*err)
399 90592 : *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
400 :
401 90592 : if (unlikely(*err < 0))
402 0 : goto repair_branches;
403 :
404 : /* Even in case of data=writeback it is reasonable to pin
405 : * inode to transaction, to prevent unexpected data loss */
406 90592 : *err = ext4_jbd2_inode_add_write(handle, orig_inode,
407 90592 : (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
408 :
409 1865890 : unlock_folios:
410 1865890 : folio_unlock(folio[0]);
411 1865890 : folio_put(folio[0]);
412 1865890 : folio_unlock(folio[1]);
413 1865890 : folio_put(folio[1]);
414 1865890 : stop_journal:
415 1865890 : ext4_journal_stop(handle);
416 1865890 : if (*err == -ENOSPC &&
417 0 : ext4_should_retry_alloc(sb, &retries))
418 0 : goto again;
419 : /* Buffer was busy because probably is pinned to journal transaction,
420 : * force transaction commit may help to free it. */
421 1869036 : if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
422 3146 : jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
423 3146 : goto again;
424 : return replaced_count;
425 :
426 : repair_branches:
427 : /*
428 : * This should never ever happen!
429 : * Extents are swapped already, but we are not able to copy data.
430 : * Try to swap extents to it's original places
431 : */
432 0 : ext4_double_down_write_data_sem(orig_inode, donor_inode);
433 0 : replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
434 : orig_blk_offset, donor_blk_offset,
435 : block_len_in_page, 0, &err2);
436 0 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
437 0 : if (replaced_count != block_len_in_page) {
438 0 : ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
439 : EIO, "Unable to copy data block,"
440 : " data will be lost.");
441 0 : *err = -EIO;
442 : }
443 0 : replaced_count = 0;
444 0 : goto unlock_folios;
445 : }
446 :
447 : /**
448 : * mext_check_arguments - Check whether move extent can be done
449 : *
450 : * @orig_inode: original inode
451 : * @donor_inode: donor inode
452 : * @orig_start: logical start offset in block for orig
453 : * @donor_start: logical start offset in block for donor
454 : * @len: the number of blocks to be moved
455 : *
456 : * Check the arguments of ext4_move_extents() whether the files can be
457 : * exchanged with each other.
458 : * Return 0 on success, or a negative error value on failure.
459 : */
460 : static int
461 185152 : mext_check_arguments(struct inode *orig_inode,
462 : struct inode *donor_inode, __u64 orig_start,
463 : __u64 donor_start, __u64 *len)
464 : {
465 185152 : __u64 orig_eof, donor_eof;
466 185152 : unsigned int blkbits = orig_inode->i_blkbits;
467 185152 : unsigned int blocksize = 1 << blkbits;
468 :
469 185152 : orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
470 185152 : donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
471 :
472 :
473 185152 : if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
474 : ext4_debug("ext4 move extent: suid or sgid is set"
475 : " to donor file [ino:orig %lu, donor %lu]\n",
476 : orig_inode->i_ino, donor_inode->i_ino);
477 : return -EINVAL;
478 : }
479 :
480 185152 : if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
481 : return -EPERM;
482 :
483 : /* Ext4 move extent does not support swap files */
484 185152 : if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
485 : ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
486 : orig_inode->i_ino, donor_inode->i_ino);
487 : return -ETXTBSY;
488 : }
489 :
490 185152 : if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
491 : ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
492 : orig_inode->i_ino, donor_inode->i_ino);
493 : return -EOPNOTSUPP;
494 : }
495 :
496 : /* Ext4 move extent supports only extent based file */
497 185152 : if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
498 : ext4_debug("ext4 move extent: orig file is not extents "
499 : "based file [ino:orig %lu]\n", orig_inode->i_ino);
500 : return -EOPNOTSUPP;
501 185152 : } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
502 : ext4_debug("ext4 move extent: donor file is not extents "
503 : "based file [ino:donor %lu]\n", donor_inode->i_ino);
504 : return -EOPNOTSUPP;
505 : }
506 :
507 185152 : if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
508 : ext4_debug("ext4 move extent: File size is 0 byte\n");
509 : return -EINVAL;
510 : }
511 :
512 : /* Start offset should be same */
513 162714 : if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
514 162714 : (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
515 : ext4_debug("ext4 move extent: orig and donor's start "
516 : "offsets are not aligned [ino:orig %lu, donor %lu]\n",
517 : orig_inode->i_ino, donor_inode->i_ino);
518 : return -EINVAL;
519 : }
520 :
521 162714 : if ((orig_start >= EXT_MAX_BLOCKS) ||
522 162714 : (donor_start >= EXT_MAX_BLOCKS) ||
523 162714 : (*len > EXT_MAX_BLOCKS) ||
524 162714 : (donor_start + *len >= EXT_MAX_BLOCKS) ||
525 162714 : (orig_start + *len >= EXT_MAX_BLOCKS)) {
526 : ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
527 : "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
528 : orig_inode->i_ino, donor_inode->i_ino);
529 : return -EINVAL;
530 : }
531 162714 : if (orig_eof <= orig_start)
532 0 : *len = 0;
533 162714 : else if (orig_eof < orig_start + *len - 1)
534 0 : *len = orig_eof - orig_start;
535 162714 : if (donor_eof <= donor_start)
536 2721 : *len = 0;
537 159993 : else if (donor_eof < donor_start + *len - 1)
538 0 : *len = donor_eof - donor_start;
539 162714 : if (!*len) {
540 2721 : ext4_debug("ext4 move extent: len should not be 0 "
541 : "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
542 : donor_inode->i_ino);
543 2721 : return -EINVAL;
544 : }
545 :
546 : return 0;
547 : }
548 :
549 : /**
550 : * ext4_move_extents - Exchange the specified range of a file
551 : *
552 : * @o_filp: file structure of the original file
553 : * @d_filp: file structure of the donor file
554 : * @orig_blk: start offset in block for orig
555 : * @donor_blk: start offset in block for donor
556 : * @len: the number of blocks to be moved
557 : * @moved_len: moved block length
558 : *
559 : * This function returns 0 and moved block length is set in moved_len
560 : * if succeed, otherwise returns error value.
561 : *
562 : */
563 : int
564 185151 : ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
565 : __u64 donor_blk, __u64 len, __u64 *moved_len)
566 : {
567 185151 : struct inode *orig_inode = file_inode(o_filp);
568 185151 : struct inode *donor_inode = file_inode(d_filp);
569 185151 : struct ext4_ext_path *path = NULL;
570 185151 : int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
571 185151 : ext4_lblk_t o_end, o_start = orig_blk;
572 185151 : ext4_lblk_t d_start = donor_blk;
573 185151 : int ret;
574 :
575 185151 : if (orig_inode->i_sb != donor_inode->i_sb) {
576 : ext4_debug("ext4 move extent: The argument files "
577 : "should be in same FS [ino:orig %lu, donor %lu]\n",
578 : orig_inode->i_ino, donor_inode->i_ino);
579 : return -EINVAL;
580 : }
581 :
582 : /* orig and donor should be different inodes */
583 185151 : if (orig_inode == donor_inode) {
584 : ext4_debug("ext4 move extent: The argument files should not "
585 : "be same inode [ino:orig %lu, donor %lu]\n",
586 : orig_inode->i_ino, donor_inode->i_ino);
587 : return -EINVAL;
588 : }
589 :
590 : /* Regular file check */
591 185151 : if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
592 : ext4_debug("ext4 move extent: The argument files should be "
593 : "regular file [ino:orig %lu, donor %lu]\n",
594 : orig_inode->i_ino, donor_inode->i_ino);
595 : return -EINVAL;
596 : }
597 :
598 : /* TODO: it's not obvious how to swap blocks for inodes with full
599 : journaling enabled */
600 370301 : if (ext4_should_journal_data(orig_inode) ||
601 : ext4_should_journal_data(donor_inode)) {
602 0 : ext4_msg(orig_inode->i_sb, KERN_ERR,
603 : "Online defrag not supported with data journaling");
604 0 : return -EOPNOTSUPP;
605 : }
606 :
607 185150 : if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
608 0 : ext4_msg(orig_inode->i_sb, KERN_ERR,
609 : "Online defrag not supported for encrypted files");
610 0 : return -EOPNOTSUPP;
611 : }
612 :
613 : /* Protect orig and donor inodes against a truncate */
614 185150 : lock_two_nondirectories(orig_inode, donor_inode);
615 :
616 : /* Wait for all existing dio workers */
617 185152 : inode_dio_wait(orig_inode);
618 185152 : inode_dio_wait(donor_inode);
619 :
620 : /* Protect extent tree against block allocations via delalloc */
621 185152 : ext4_double_down_write_data_sem(orig_inode, donor_inode);
622 : /* Check the filesystem environment whether move_extent can be done */
623 185152 : ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
624 : donor_blk, &len);
625 185152 : if (ret)
626 25159 : goto out;
627 159993 : o_end = o_start + len;
628 :
629 2022774 : while (o_start < o_end) {
630 1863476 : struct ext4_extent *ex;
631 1863476 : ext4_lblk_t cur_blk, next_blk;
632 1863476 : pgoff_t orig_page_index, donor_page_index;
633 1863476 : int offset_in_page;
634 1863476 : int unwritten, cur_len;
635 :
636 1863476 : ret = get_ext_path(orig_inode, o_start, &path);
637 1863476 : if (ret)
638 30 : goto out;
639 1863446 : ex = path[path->p_depth].p_ext;
640 1863446 : cur_blk = le32_to_cpu(ex->ee_block);
641 1863446 : cur_len = ext4_ext_get_actual_len(ex);
642 : /* Check hole before the start pos */
643 1863446 : if (cur_blk + cur_len - 1 < o_start) {
644 699 : next_blk = ext4_ext_next_allocated_block(path);
645 699 : if (next_blk == EXT_MAX_BLOCKS) {
646 36 : ret = -ENODATA;
647 36 : goto out;
648 : }
649 663 : d_start += next_blk - o_start;
650 663 : o_start = next_blk;
651 663 : continue;
652 : /* Check hole after the start pos */
653 1862747 : } else if (cur_blk > o_start) {
654 : /* Skip hole */
655 119 : d_start += cur_blk - o_start;
656 119 : o_start = cur_blk;
657 : /* Extent inside requested range ?*/
658 119 : if (cur_blk >= o_end)
659 3 : goto out;
660 : } else { /* in_range(o_start, o_blk, o_len) */
661 1862628 : cur_len += cur_blk - o_start;
662 : }
663 1862744 : unwritten = ext4_ext_is_unwritten(ex);
664 1862744 : if (o_end - o_start < cur_len)
665 1580567 : cur_len = o_end - o_start;
666 :
667 0 : orig_page_index = o_start >> (PAGE_SHIFT -
668 1862744 : orig_inode->i_blkbits);
669 0 : donor_page_index = d_start >> (PAGE_SHIFT -
670 1862744 : donor_inode->i_blkbits);
671 1862744 : offset_in_page = o_start % blocks_per_page;
672 1862744 : if (cur_len > blocks_per_page - offset_in_page)
673 : cur_len = blocks_per_page - offset_in_page;
674 : /*
675 : * Up semaphore to avoid following problems:
676 : * a. transaction deadlock among ext4_journal_start,
677 : * ->write_begin via pagefault, and jbd2_journal_commit
678 : * b. racing with ->read_folio, ->write_begin, and
679 : * ext4_get_block in move_extent_per_page
680 : */
681 1862744 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
682 : /* Swap original branches with new branches */
683 1862744 : move_extent_per_page(o_filp, donor_inode,
684 : orig_page_index, donor_page_index,
685 : offset_in_page, cur_len,
686 : unwritten, &ret);
687 1862744 : ext4_double_down_write_data_sem(orig_inode, donor_inode);
688 1862744 : if (ret < 0)
689 : break;
690 1862118 : o_start += cur_len;
691 1862118 : d_start += cur_len;
692 : }
693 159924 : *moved_len = o_start - orig_blk;
694 159924 : if (*moved_len > len)
695 2 : *moved_len = len;
696 :
697 159922 : out:
698 185152 : if (*moved_len) {
699 159558 : ext4_discard_preallocations(orig_inode, 0);
700 159558 : ext4_discard_preallocations(donor_inode, 0);
701 : }
702 :
703 185152 : ext4_free_ext_path(path);
704 185152 : ext4_double_up_write_data_sem(orig_inode, donor_inode);
705 185152 : unlock_two_nondirectories(orig_inode, donor_inode);
706 :
707 185145 : return ret;
708 : }
|