android_kernel_asus_sm8350/fs/ext4/file.c
Jan Kara a22f52d883 ext4: avoid crash when inline data creation follows DIO write
commit 4bb26f2885ac6930984ee451b952c5a6042f2c0e upstream.

When inode is created and written to using direct IO, there is nothing
to clear the EXT4_STATE_MAY_INLINE_DATA flag. Thus when inode gets
truncated later to say 1 byte and written using normal write, we will
try to store the data as inline data. This confuses the code later
because the inode now has both normal block and inline data allocated
and the confusion manifests for example as:

kernel BUG at fs/ext4/inode.c:2721!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 359 Comm: repro Not tainted 5.19.0-rc8-00001-g31ba1e3b8305-dirty #15
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014
RIP: 0010:ext4_writepages+0x363d/0x3660
RSP: 0018:ffffc90000ccf260 EFLAGS: 00010293
RAX: ffffffff81e1abcd RBX: 0000008000000000 RCX: ffff88810842a180
RDX: 0000000000000000 RSI: 0000008000000000 RDI: 0000000000000000
RBP: ffffc90000ccf650 R08: ffffffff81e17d58 R09: ffffed10222c680b
R10: dfffe910222c680c R11: 1ffff110222c680a R12: ffff888111634128
R13: ffffc90000ccf880 R14: 0000008410000000 R15: 0000000000000001
FS:  00007f72635d2640(0000) GS:ffff88811b000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000565243379180 CR3: 000000010aa74000 CR4: 0000000000150eb0
Call Trace:
 <TASK>
 do_writepages+0x397/0x640
 filemap_fdatawrite_wbc+0x151/0x1b0
 file_write_and_wait_range+0x1c9/0x2b0
 ext4_sync_file+0x19e/0xa00
 vfs_fsync_range+0x17b/0x190
 ext4_buffered_write_iter+0x488/0x530
 ext4_file_write_iter+0x449/0x1b90
 vfs_write+0xbcd/0xf40
 ksys_write+0x198/0x2c0
 __x64_sys_write+0x7b/0x90
 do_syscall_64+0x3d/0x90
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
 </TASK>

Fix the problem by clearing EXT4_STATE_MAY_INLINE_DATA when we are doing
direct IO write to a file.

Cc: stable@kernel.org
Reported-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Reported-by: syzbot+bd13648a53ed6933ca49@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?id=a1e89d09bbbcbd5c4cb45db230ee28c822953984
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Tested-by: Tadeusz Struk<tadeusz.struk@linaro.org>
Link: https://lore.kernel.org/r/20220727155753.13969-1-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2022-10-26 13:22:18 +02:00

548 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/file.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/file.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4 fs regular file handling primitives
*
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
} else {
inode_lock_shared(inode);
}
/*
* Recheck under inode lock - at this point we are sure it cannot
* change anymore
*/
if (!IS_DAX(inode)) {
inode_unlock_shared(inode);
/* Fallback to buffered IO in case we cannot support DAX */
return generic_file_read_iter(iocb, to);
}
ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
return ret;
}
#endif
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
if (IS_DAX(file_inode(iocb->ki_filp)))
return ext4_dax_read_iter(iocb, to);
#endif
return generic_file_read_iter(iocb, to);
}
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
* gets called only when /all/ the files are closed.
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
return 0;
}
static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
* are converted to written only after the IO is complete. Until they are
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
* it needs to zero out portions of the start and/or end block. If 2 AIO
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
return 0;
if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
int err, blklen;
if (pos + len > i_size_read(inode))
return false;
map.m_lblk = pos >> blkbits;
map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
blklen = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
/*
* 'err==len' means that all of the blocks have been preallocated,
* regardless of whether they have been initialized or not. To exclude
* unwritten extents, we need to check m_flags.
*/
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
return iov_iter_count(from);
}
#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
return -EAGAIN;
} else {
inode_lock(inode);
}
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
ret = file_remove_privs(iocb->ki_filp);
if (ret)
goto out;
ret = file_update_time(iocb->ki_filp);
if (ret)
goto out;
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
out:
inode_unlock(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}
#endif
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
int o_direct = iocb->ki_flags & IOCB_DIRECT;
int unaligned_aio = 0;
int overwrite = 0;
ssize_t ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
inode_lock(inode);
}
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
/*
* Unaligned direct AIO must be serialized among each other as zeroing
* of partial blocks of two competing unaligned AIOs can result in data
* corruption.
*/
if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
unaligned_aio = 1;
ext4_unwritten_wait(inode);
}
iocb->private = &overwrite;
/* Check whether we do a DIO overwrite or not */
if (o_direct && !unaligned_aio) {
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
if (ext4_should_dioread_nolock(inode))
overwrite = 1;
} else if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
}
}
ret = __generic_file_write_iter(iocb, from);
/*
* Unaligned direct AIO must be the only IO in flight. Otherwise
* overlapping aligned IO after unaligned might result in data
* corruption.
*/
if (ret == -EIOCBQUEUED && unaligned_aio)
ext4_unwritten_wait(inode);
inode_unlock(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
out:
inode_unlock(inode);
return ret;
}
#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
int error = 0;
vm_fault_t result;
int retries = 0;
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
/*
* We have to distinguish real writes from writes which will result in a
* COW page; COW writes should *not* poke the journal (the file will not
* be changed). Doing so would cause unintended failures when mounted
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
* unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
ext4_should_retry_alloc(sb, &retries))
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
} else {
up_read(&EXT4_I(inode)->i_mmap_sem);
}
return result;
}
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct dax_device *dax_dev = sbi->s_daxdev;
if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
static int ext4_sample_last_mounted(struct super_block *sb,
struct vfsmount *mnt)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct path path;
char buf[64], *cp;
handle_t *handle;
int err;
if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
* when trying to sort through large numbers of block
* devices or filesystem images.
*/
memset(buf, 0, sizeof(buf));
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
err = 0;
if (IS_ERR(cp))
goto out;
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
err = PTR_ERR(handle);
if (IS_ERR(handle))
goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto out_journal;
strncpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
ext4_handle_dirty_super(handle, sb);
out_journal:
ext4_journal_stop(handle);
out:
sb_end_intwrite(sb);
return err;
}
static int ext4_file_open(struct inode * inode, struct file * filp)
{
int ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
return ret;
ret = fscrypt_file_open(inode, filp);
if (ret)
return ret;
ret = fsverity_file_open(inode, filp);
if (ret)
return ret;
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
if (filp->f_mode & FMODE_WRITE) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
return ret;
}
filp->f_mode |= FMODE_NOWAIT;
return dquot_file_open(inode, filp);
}
/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
*/
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t maxbytes;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
else
maxbytes = inode->i_sb->s_maxbytes;
switch (whence) {
default:
return generic_file_llseek_size(file, offset, whence,
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
inode_unlock_shared(inode);
break;
}
/*
* Make sure inline data cannot be created anymore since we are going
* to allocate blocks for DIO. We know the inode does not have any
* inline data now because ext4_dio_supported() checked for that.
*/
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
if (offset < 0)
return offset;
return vfs_setpos(file, offset, maxbytes);
}
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};