Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * High-level sync()-related operations
4 : */
5 :
6 : #include <linux/blkdev.h>
7 : #include <linux/kernel.h>
8 : #include <linux/file.h>
9 : #include <linux/fs.h>
10 : #include <linux/slab.h>
11 : #include <linux/export.h>
12 : #include <linux/namei.h>
13 : #include <linux/sched.h>
14 : #include <linux/writeback.h>
15 : #include <linux/syscalls.h>
16 : #include <linux/linkage.h>
17 : #include <linux/pagemap.h>
18 : #include <linux/quotaops.h>
19 : #include <linux/backing-dev.h>
20 : #include "internal.h"
21 :
/* Every flag bit accepted by sync_file_range(); any other bit is rejected. */
#define VALID_FLAGS \
	(SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
	 SYNC_FILE_RANGE_WAIT_AFTER)
24 :
25 : /*
26 : * Write out and wait upon all dirty data associated with this
27 : * superblock. Filesystem data as well as the underlying block
28 : * device. Takes the superblock lock.
29 : */
30 117813 : int sync_filesystem(struct super_block *sb)
31 : {
32 117813 : int ret = 0;
33 :
34 : /*
35 : * We need to be protected against the filesystem going from
36 : * r/o to r/w or vice versa.
37 : */
38 117813 : WARN_ON(!rwsem_is_locked(&sb->s_umount));
39 :
40 : /*
41 : * No point in syncing out anything if the filesystem is read-only.
42 : */
43 117813 : if (sb_rdonly(sb))
44 : return 0;
45 :
46 : /*
47 : * Do the filesystem syncing work. For simple filesystems
48 : * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
49 : * to submit I/O for these buffers via sync_blockdev(). This also
50 : * speeds up the wait == 1 case since in that case write_inode()
51 : * methods call sync_dirty_buffer() and thus effectively write one block
52 : * at a time.
53 : */
54 104049 : writeback_inodes_sb(sb, WB_REASON_SYNC);
55 104049 : if (sb->s_op->sync_fs) {
56 103881 : ret = sb->s_op->sync_fs(sb, 0);
57 103880 : if (ret)
58 : return ret;
59 : }
60 104048 : ret = sync_blockdev_nowait(sb->s_bdev);
61 104049 : if (ret)
62 : return ret;
63 :
64 104049 : sync_inodes_sb(sb);
65 104049 : if (sb->s_op->sync_fs) {
66 103881 : ret = sb->s_op->sync_fs(sb, 1);
67 103834 : if (ret)
68 : return ret;
69 : }
70 92277 : return sync_blockdev(sb->s_bdev);
71 : }
72 : EXPORT_SYMBOL(sync_filesystem);
73 :
/*
 * iterate_supers() callback: sync the inodes of @sb unless the superblock
 * is read-only. @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (sb_rdonly(sb))
		return;

	sync_inodes_sb(sb);
}
79 :
80 121222167 : static void sync_fs_one_sb(struct super_block *sb, void *arg)
81 : {
82 121222167 : if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
83 94945825 : sb->s_op->sync_fs)
84 4060780 : sb->s_op->sync_fs(sb, *(int *)arg);
85 121221428 : }
86 :
87 : /*
88 : * Sync everything. We start by waking flusher threads so that most of
89 : * writeback runs on all devices in parallel. Then we sync all inodes reliably
90 : * which effectively also waits for all flusher threads to finish doing
91 : * writeback. At this point all data is on disk so metadata should be stable
92 : * and we tell filesystems to sync their metadata via ->sync_fs() calls.
93 : * Finally, we writeout all block devices because some filesystems (e.g. ext2)
94 : * just write metadata (such as inodes or bitmaps) to block device page cache
95 : * and do not sync it on their own in ->sync_fs().
96 : */
97 1010029 : void ksys_sync(void)
98 : {
99 1010029 : int nowait = 0, wait = 1;
100 :
101 1010029 : wakeup_flusher_threads(WB_REASON_SYNC);
102 1010035 : iterate_supers(sync_inodes_one_sb, NULL);
103 1010029 : iterate_supers(sync_fs_one_sb, &nowait);
104 1010035 : iterate_supers(sync_fs_one_sb, &wait);
105 1009429 : sync_bdevs(false);
106 1010035 : sync_bdevs(true);
107 1010035 : if (unlikely(laptop_mode))
108 0 : laptop_sync_completion();
109 1010035 : }
110 :
/* sync(2) entry point: run ksys_sync() and always report success. */
SYSCALL_DEFINE0(sync)
{
	ksys_sync();

	return 0;
}
116 :
117 0 : static void do_sync_work(struct work_struct *work)
118 : {
119 0 : int nowait = 0;
120 :
121 : /*
122 : * Sync twice to reduce the possibility we skipped some inodes / pages
123 : * because they were temporarily locked
124 : */
125 0 : iterate_supers(sync_inodes_one_sb, &nowait);
126 0 : iterate_supers(sync_fs_one_sb, &nowait);
127 0 : sync_bdevs(false);
128 0 : iterate_supers(sync_inodes_one_sb, &nowait);
129 0 : iterate_supers(sync_fs_one_sb, &nowait);
130 0 : sync_bdevs(false);
131 0 : printk("Emergency Sync complete\n");
132 0 : kfree(work);
133 0 : }
134 :
135 0 : void emergency_sync(void)
136 : {
137 0 : struct work_struct *work;
138 :
139 0 : work = kmalloc(sizeof(*work), GFP_ATOMIC);
140 0 : if (work) {
141 0 : INIT_WORK(work, do_sync_work);
142 0 : schedule_work(work);
143 : }
144 0 : }
145 :
146 : /*
147 : * sync a single super
148 : */
149 78442 : SYSCALL_DEFINE1(syncfs, int, fd)
150 : {
151 39221 : struct fd f = fdget(fd);
152 39221 : struct super_block *sb;
153 39221 : int ret, ret2;
154 :
155 39221 : if (!f.file)
156 : return -EBADF;
157 39221 : sb = f.file->f_path.dentry->d_sb;
158 :
159 39221 : down_read(&sb->s_umount);
160 39221 : ret = sync_filesystem(sb);
161 39221 : up_read(&sb->s_umount);
162 :
163 39221 : ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
164 :
165 39221 : fdput(f);
166 39221 : return ret ? ret : ret2;
167 : }
168 :
169 : /**
170 : * vfs_fsync_range - helper to sync a range of data & metadata to disk
171 : * @file: file to sync
172 : * @start: offset in bytes of the beginning of data range to sync
173 : * @end: offset in bytes of the end of data range (inclusive)
174 : * @datasync: perform only datasync
175 : *
176 : * Write back data in range @start..@end and metadata for @file to disk. If
177 : * @datasync is set only metadata needed to access modified file data is
178 : * written.
179 : */
180 17110019 : int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
181 : {
182 17110019 : struct inode *inode = file->f_mapping->host;
183 :
184 17110019 : if (!file->f_op->fsync)
185 : return -EINVAL;
186 17109813 : if (!datasync && (inode->i_state & I_DIRTY_TIME))
187 4 : mark_inode_dirty_sync(inode);
188 17109813 : return file->f_op->fsync(file, start, end, datasync);
189 : }
190 : EXPORT_SYMBOL(vfs_fsync_range);
191 :
192 : /**
193 : * vfs_fsync - perform a fsync or fdatasync on a file
194 : * @file: file to sync
195 : * @datasync: only perform a fdatasync operation
196 : *
197 : * Write back data and metadata for @file to disk. If @datasync is
198 : * set only metadata needed to access modified file data is written.
199 : */
200 64731 : int vfs_fsync(struct file *file, int datasync)
201 : {
202 64731 : return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
203 : }
204 : EXPORT_SYMBOL(vfs_fsync);
205 :
206 16351442 : static int do_fsync(unsigned int fd, int datasync)
207 : {
208 16351442 : struct fd f = fdget(fd);
209 16350695 : int ret = -EBADF;
210 :
211 16350695 : if (f.file) {
212 16350695 : ret = vfs_fsync(f.file, datasync);
213 16349758 : fdput(f);
214 : }
215 16349757 : return ret;
216 : }
217 :
218 30723012 : SYSCALL_DEFINE1(fsync, unsigned int, fd)
219 : {
220 15362444 : return do_fsync(fd, 0);
221 : }
222 :
223 1977691 : SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
224 : {
225 988848 : return do_fsync(fd, 1);
226 : }
227 :
228 10727 : int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
229 : unsigned int flags)
230 : {
231 10727 : int ret;
232 10727 : struct address_space *mapping;
233 10727 : loff_t endbyte; /* inclusive */
234 10727 : umode_t i_mode;
235 :
236 10727 : ret = -EINVAL;
237 10727 : if (flags & ~VALID_FLAGS)
238 0 : goto out;
239 :
240 10727 : endbyte = offset + nbytes;
241 :
242 10727 : if ((s64)offset < 0)
243 0 : goto out;
244 10727 : if ((s64)endbyte < 0)
245 0 : goto out;
246 10727 : if (endbyte < offset)
247 0 : goto out;
248 :
249 10727 : if (sizeof(pgoff_t) == 4) {
250 : if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
251 : /*
252 : * The range starts outside a 32 bit machine's
253 : * pagecache addressing capabilities. Let it "succeed"
254 : */
255 : ret = 0;
256 : goto out;
257 : }
258 : if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
259 : /*
260 : * Out to EOF
261 : */
262 : nbytes = 0;
263 : }
264 : }
265 :
266 10727 : if (nbytes == 0)
267 : endbyte = LLONG_MAX;
268 : else
269 16 : endbyte--; /* inclusive */
270 :
271 10727 : i_mode = file_inode(file)->i_mode;
272 10727 : ret = -ESPIPE;
273 10727 : if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
274 : !S_ISLNK(i_mode))
275 0 : goto out;
276 :
277 10727 : mapping = file->f_mapping;
278 10727 : ret = 0;
279 10727 : if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
280 2 : ret = file_fdatawait_range(file, offset, endbyte);
281 2 : if (ret < 0)
282 0 : goto out;
283 : }
284 :
285 10727 : if (flags & SYNC_FILE_RANGE_WRITE) {
286 10719 : int sync_mode = WB_SYNC_NONE;
287 :
288 10719 : if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
289 : SYNC_FILE_RANGE_WRITE_AND_WAIT)
290 0 : sync_mode = WB_SYNC_ALL;
291 :
292 10719 : ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
293 : sync_mode);
294 10786 : if (ret < 0)
295 0 : goto out;
296 : }
297 :
298 10794 : if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
299 38 : ret = file_fdatawait_range(file, offset, endbyte);
300 :
301 10756 : out:
302 10794 : return ret;
303 : }
304 :
305 : /*
306 : * ksys_sync_file_range() permits finely controlled syncing over a segment of
307 : * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
308 : * zero then ksys_sync_file_range() will operate from offset out to EOF.
309 : *
310 : * The flag bits are:
311 : *
312 : * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
313 : * before performing the write.
314 : *
315 : * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
316 : * range which are not presently under writeback. Note that this may block for
317 : * significant periods due to exhaustion of disk request structures.
318 : *
319 : * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
320 : * after performing the write.
321 : *
322 : * Useful combinations of the flag bits are:
323 : *
324 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
325 : * in the range which were dirty on entry to ksys_sync_file_range() are placed
326 : * under writeout. This is a start-write-for-data-integrity operation.
327 : *
328 : * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
329 : * are not presently under writeout. This is an asynchronous flush-to-disk
330 : * operation. Not suitable for data integrity operations.
331 : *
332 : * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
333 : * completion of writeout of all pages in the range. This will be used after an
334 : * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
335 : * for that operation to complete and to return the result.
336 : *
337 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
338 : * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
339 : * a traditional sync() operation. This is a write-for-data-integrity operation
340 : * which will ensure that all pages in the range which were dirty on entry to
341 : * ksys_sync_file_range() are written to disk. It should be noted that disk
342 : * caches are not flushed by this call, so there are no guarantees here that the
343 : * data will be available on disk after a crash.
344 : *
345 : *
346 : * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
347 : * I/O errors or ENOSPC conditions and will return those to the caller, after
348 : * clearing the EIO and ENOSPC flags in the address_space.
349 : *
350 : * It should be noted that none of these operations write out the file's
351 : * metadata. So unless the application is strictly performing overwrites of
352 : * already-instantiated disk blocks, there are no guarantees here that the data
353 : * will be available after a crash.
354 : */
355 10784 : int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
356 : unsigned int flags)
357 : {
358 10784 : int ret;
359 10784 : struct fd f;
360 :
361 10784 : ret = -EBADF;
362 10784 : f = fdget(fd);
363 10802 : if (f.file)
364 10802 : ret = sync_file_range(f.file, offset, nbytes, flags);
365 :
366 10795 : fdput(f);
367 10821 : return ret;
368 : }
369 :
370 21553 : SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
371 : unsigned int, flags)
372 : {
373 10732 : return ksys_sync_file_range(fd, offset, nbytes, flags);
374 : }
375 :
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
/*
 * Compat entry point: @offset and @nbytes each arrive as a pair of
 * arguments and are glued back together before being forwarded to
 * ksys_sync_file_range().
 */
COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(nbytes), unsigned int, flags)
{
	return ksys_sync_file_range(fd,
				    compat_arg_u64_glue(offset),
				    compat_arg_u64_glue(nbytes),
				    flags);
}
#endif
384 :
385 : /* It would be nice if people remember that not all the world's an i386
386 : when they introduce new system calls */
387 0 : SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
388 : loff_t, offset, loff_t, nbytes)
389 : {
390 0 : return ksys_sync_file_range(fd, offset, nbytes, flags);
391 : }
|