Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * High-level sync()-related operations
4 : */
5 :
6 : #include <linux/blkdev.h>
7 : #include <linux/kernel.h>
8 : #include <linux/file.h>
9 : #include <linux/fs.h>
10 : #include <linux/slab.h>
11 : #include <linux/export.h>
12 : #include <linux/namei.h>
13 : #include <linux/sched.h>
14 : #include <linux/writeback.h>
15 : #include <linux/syscalls.h>
16 : #include <linux/linkage.h>
17 : #include <linux/pagemap.h>
18 : #include <linux/quotaops.h>
19 : #include <linux/backing-dev.h>
20 : #include "internal.h"
21 :
/*
 * Every flag bit sync_file_range() accepts; a request carrying any other
 * bit is rejected with -EINVAL.
 */
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE | \
		     SYNC_FILE_RANGE_WRITE | \
		     SYNC_FILE_RANGE_WAIT_AFTER)
24 :
25 : /*
26 : * Write out and wait upon all dirty data associated with this
27 : * superblock. Filesystem data as well as the underlying block
28 : * device. Takes the superblock lock.
29 : */
30 493839 : int sync_filesystem(struct super_block *sb)
31 : {
32 493839 : int ret = 0;
33 :
34 : /*
35 : * We need to be protected against the filesystem going from
36 : * r/o to r/w or vice versa.
37 : */
38 493839 : WARN_ON(!rwsem_is_locked(&sb->s_umount));
39 :
40 : /*
41 : * No point in syncing out anything if the filesystem is read-only.
42 : */
43 493839 : if (sb_rdonly(sb))
44 : return 0;
45 :
46 : /*
47 : * Do the filesystem syncing work. For simple filesystems
48 : * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
49 : * to submit I/O for these buffers via sync_blockdev(). This also
50 : * speeds up the wait == 1 case since in that case write_inode()
51 : * methods call sync_dirty_buffer() and thus effectively write one block
52 : * at a time.
53 : */
54 473361 : writeback_inodes_sb(sb, WB_REASON_SYNC);
55 473352 : if (sb->s_op->sync_fs) {
56 472633 : ret = sb->s_op->sync_fs(sb, 0);
57 472634 : if (ret)
58 : return ret;
59 : }
60 473353 : ret = sync_blockdev_nowait(sb->s_bdev);
61 473353 : if (ret)
62 : return ret;
63 :
64 473353 : sync_inodes_sb(sb);
65 473363 : if (sb->s_op->sync_fs) {
66 472644 : ret = sb->s_op->sync_fs(sb, 1);
67 472610 : if (ret)
68 : return ret;
69 : }
70 460395 : return sync_blockdev(sb->s_bdev);
71 : }
72 : EXPORT_SYMBOL(sync_filesystem);
73 :
/*
 * iterate_supers() callback: run sync_inodes_sb() on @sb unless it is
 * read-only.  @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (sb_rdonly(sb))
		return;

	sync_inodes_sb(sb);
}
79 :
80 153169044 : static void sync_fs_one_sb(struct super_block *sb, void *arg)
81 : {
82 153169044 : if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
83 118262694 : sb->s_op->sync_fs)
84 5368747 : sb->s_op->sync_fs(sb, *(int *)arg);
85 153164688 : }
86 :
87 : /*
88 : * Sync everything. We start by waking flusher threads so that most of
89 : * writeback runs on all devices in parallel. Then we sync all inodes reliably
90 : * which effectively also waits for all flusher threads to finish doing
91 : * writeback. At this point all data is on disk so metadata should be stable
92 : * and we tell filesystems to sync their metadata via ->sync_fs() calls.
93 : * Finally, we writeout all block devices because some filesystems (e.g. ext2)
94 : * just write metadata (such as inodes or bitmaps) to block device page cache
95 : * and do not sync it on their own in ->sync_fs().
96 : */
97 1344845 : void ksys_sync(void)
98 : {
99 1344845 : int nowait = 0, wait = 1;
100 :
101 1344845 : wakeup_flusher_threads(WB_REASON_SYNC);
102 1344835 : iterate_supers(sync_inodes_one_sb, NULL);
103 1344848 : iterate_supers(sync_fs_one_sb, &nowait);
104 1344821 : iterate_supers(sync_fs_one_sb, &wait);
105 1344544 : sync_bdevs(false);
106 1344851 : sync_bdevs(true);
107 1344849 : if (unlikely(laptop_mode))
108 0 : laptop_sync_completion();
109 1344849 : }
110 :
/* sync(2) entry point: runs ksys_sync() and always reports success. */
SYSCALL_DEFINE0(sync)
{
	ksys_sync();

	return 0;
}
116 :
117 0 : static void do_sync_work(struct work_struct *work)
118 : {
119 0 : int nowait = 0;
120 :
121 : /*
122 : * Sync twice to reduce the possibility we skipped some inodes / pages
123 : * because they were temporarily locked
124 : */
125 0 : iterate_supers(sync_inodes_one_sb, &nowait);
126 0 : iterate_supers(sync_fs_one_sb, &nowait);
127 0 : sync_bdevs(false);
128 0 : iterate_supers(sync_inodes_one_sb, &nowait);
129 0 : iterate_supers(sync_fs_one_sb, &nowait);
130 0 : sync_bdevs(false);
131 0 : printk("Emergency Sync complete\n");
132 0 : kfree(work);
133 0 : }
134 :
135 0 : void emergency_sync(void)
136 : {
137 0 : struct work_struct *work;
138 :
139 0 : work = kmalloc(sizeof(*work), GFP_ATOMIC);
140 0 : if (work) {
141 0 : INIT_WORK(work, do_sync_work);
142 0 : schedule_work(work);
143 : }
144 0 : }
145 :
146 : /*
147 : * sync a single super
148 : */
149 596772 : SYSCALL_DEFINE1(syncfs, int, fd)
150 : {
151 298386 : struct fd f = fdget(fd);
152 298386 : struct super_block *sb;
153 298386 : int ret, ret2;
154 :
155 298386 : if (!f.file)
156 : return -EBADF;
157 298386 : sb = f.file->f_path.dentry->d_sb;
158 :
159 298386 : down_read(&sb->s_umount);
160 298386 : ret = sync_filesystem(sb);
161 298386 : up_read(&sb->s_umount);
162 :
163 298386 : ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
164 :
165 298386 : fdput(f);
166 298386 : return ret ? ret : ret2;
167 : }
168 :
169 : /**
170 : * vfs_fsync_range - helper to sync a range of data & metadata to disk
171 : * @file: file to sync
172 : * @start: offset in bytes of the beginning of data range to sync
173 : * @end: offset in bytes of the end of data range (inclusive)
174 : * @datasync: perform only datasync
175 : *
176 : * Write back data in range @start..@end and metadata for @file to disk. If
177 : * @datasync is set only metadata needed to access modified file data is
178 : * written.
179 : */
180 15820573 : int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
181 : {
182 15820573 : struct inode *inode = file->f_mapping->host;
183 :
184 15820573 : if (!file->f_op->fsync)
185 : return -EINVAL;
186 15820381 : if (!datasync && (inode->i_state & I_DIRTY_TIME))
187 24 : mark_inode_dirty_sync(inode);
188 15820381 : return file->f_op->fsync(file, start, end, datasync);
189 : }
190 : EXPORT_SYMBOL(vfs_fsync_range);
191 :
192 : /**
193 : * vfs_fsync - perform a fsync or fdatasync on a file
194 : * @file: file to sync
195 : * @datasync: only perform a fdatasync operation
196 : *
197 : * Write back data and metadata for @file to disk. If @datasync is
198 : * set only metadata needed to access modified file data is written.
199 : */
200 85932 : int vfs_fsync(struct file *file, int datasync)
201 : {
202 85932 : return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
203 : }
204 : EXPORT_SYMBOL(vfs_fsync);
205 :
206 12785050 : static int do_fsync(unsigned int fd, int datasync)
207 : {
208 12785050 : struct fd f = fdget(fd);
209 12784102 : int ret = -EBADF;
210 :
211 12784102 : if (f.file) {
212 12784102 : ret = vfs_fsync(f.file, datasync);
213 12781033 : fdput(f);
214 : }
215 12781036 : return ret;
216 : }
217 :
218 23165551 : SYSCALL_DEFINE1(fsync, unsigned int, fd)
219 : {
220 11586136 : return do_fsync(fd, 0);
221 : }
222 :
223 2399882 : SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
224 : {
225 1199966 : return do_fsync(fd, 1);
226 : }
227 :
228 86415 : int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
229 : unsigned int flags)
230 : {
231 86415 : int ret;
232 86415 : struct address_space *mapping;
233 86415 : loff_t endbyte; /* inclusive */
234 86415 : umode_t i_mode;
235 :
236 86415 : ret = -EINVAL;
237 86415 : if (flags & ~VALID_FLAGS)
238 0 : goto out;
239 :
240 86415 : endbyte = offset + nbytes;
241 :
242 86415 : if ((s64)offset < 0)
243 0 : goto out;
244 86415 : if ((s64)endbyte < 0)
245 0 : goto out;
246 86415 : if (endbyte < offset)
247 0 : goto out;
248 :
249 86415 : if (sizeof(pgoff_t) == 4) {
250 : if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
251 : /*
252 : * The range starts outside a 32 bit machine's
253 : * pagecache addressing capabilities. Let it "succeed"
254 : */
255 : ret = 0;
256 : goto out;
257 : }
258 : if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
259 : /*
260 : * Out to EOF
261 : */
262 : nbytes = 0;
263 : }
264 : }
265 :
266 86415 : if (nbytes == 0)
267 : endbyte = LLONG_MAX;
268 : else
269 717 : endbyte--; /* inclusive */
270 :
271 86415 : i_mode = file_inode(file)->i_mode;
272 86415 : ret = -ESPIPE;
273 86415 : if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
274 : !S_ISLNK(i_mode))
275 0 : goto out;
276 :
277 86415 : mapping = file->f_mapping;
278 86415 : ret = 0;
279 86415 : if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
280 646 : ret = file_fdatawait_range(file, offset, endbyte);
281 646 : if (ret < 0)
282 0 : goto out;
283 : }
284 :
285 86415 : if (flags & SYNC_FILE_RANGE_WRITE) {
286 86375 : int sync_mode = WB_SYNC_NONE;
287 :
288 86375 : if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
289 : SYNC_FILE_RANGE_WRITE_AND_WAIT)
290 633 : sync_mode = WB_SYNC_ALL;
291 :
292 86375 : ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
293 : sync_mode);
294 86468 : if (ret < 0)
295 0 : goto out;
296 : }
297 :
298 86508 : if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
299 855 : ret = file_fdatawait_range(file, offset, endbyte);
300 :
301 85653 : out:
302 86508 : return ret;
303 : }
304 :
305 : /*
306 : * ksys_sync_file_range() permits finely controlled syncing over a segment of
307 : * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
308 : * zero then ksys_sync_file_range() will operate from offset out to EOF.
309 : *
310 : * The flag bits are:
311 : *
312 : * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
313 : * before performing the write.
314 : *
315 : * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
316 : * range which are not presently under writeback. Note that this may block for
317 : * significant periods due to exhaustion of disk request structures.
318 : *
319 : * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
320 : * after performing the write.
321 : *
322 : * Useful combinations of the flag bits are:
323 : *
324 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
325 : * in the range which were dirty on entry to ksys_sync_file_range() are placed
326 : * under writeout. This is a start-write-for-data-integrity operation.
327 : *
328 : * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
329 : * are not presently under writeout. This is an asynchronous flush-to-disk
330 : * operation. Not suitable for data integrity operations.
331 : *
332 : * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
333 : * completion of writeout of all pages in the range. This will be used after an
334 : * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
335 : * for that operation to complete and to return the result.
336 : *
337 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
338 : * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
339 : * a traditional sync() operation. This is a write-for-data-integrity operation
340 : * which will ensure that all pages in the range which were dirty on entry to
341 : * ksys_sync_file_range() are written to disk. It should be noted that disk
342 : * caches are not flushed by this call, so there are no guarantees here that the
343 : * data will be available on disk after a crash.
344 : *
345 : *
346 : * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
347 : * I/O errors or ENOSPC conditions and will return those to the caller, after
348 : * clearing the EIO and ENOSPC flags in the address_space.
349 : *
350 : * It should be noted that none of these operations write out the file's
351 : * metadata. So unless the application is strictly performing overwrites of
352 : * already-instantiated disk blocks, there are no guarantees here that the data
353 : * will be available after a crash.
354 : */
355 86281 : int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
356 : unsigned int flags)
357 : {
358 86281 : int ret;
359 86281 : struct fd f;
360 :
361 86281 : ret = -EBADF;
362 86281 : f = fdget(fd);
363 86487 : if (f.file)
364 86487 : ret = sync_file_range(f.file, offset, nbytes, flags);
365 :
366 86503 : fdput(f);
367 86556 : return ret;
368 : }
369 :
370 172962 : SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
371 : unsigned int, flags)
372 : {
373 86405 : return ksys_sync_file_range(fd, offset, nbytes, flags);
374 : }
375 :
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
/*
 * Compat entry point: @offset and @nbytes are each declared with
 * compat_arg_u64_dual() and recombined with compat_arg_u64_glue() before
 * being forwarded to ksys_sync_file_range().
 */
COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(nbytes), unsigned int, flags)
{
	loff_t off = compat_arg_u64_glue(offset);
	loff_t len = compat_arg_u64_glue(nbytes);

	return ksys_sync_file_range(fd, off, len, flags);
}
#endif
384 :
385 : /* It would be nice if people remember that not all the world's an i386
386 : when they introduce new system calls */
387 0 : SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
388 : loff_t, offset, loff_t, nbytes)
389 : {
390 0 : return ksys_sync_file_range(fd, offset, nbytes, flags);
391 : }
|