-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAl4iAaQACgkQONu9yGCS aT5vIg/+Lj4wdF3UuUWonHdWBhnfG2FKCWFTYJKPpFXFRMltAa27XKns/CvR8CBW 9ztOH928CR8K9BS7HbfGtsgOEOVzILb4+akco5UhrTH93dc2T6RwSDiMpaULgeIF x/n834yNlsHs1NSmjjuimBe1j4NcZwPnnNVGKmFojkv04QPsFjP6HCp7PR2/PMXP CVO5JBXqMYtMRprY0xkpAGCStqVZPF6uwfTPrKRgaOCTpkKsqBEFJbwqOoqGQWou fQPOmEFjw+e9rIKzJgou6k4YGrWITcpNnUMdxavCszcQFTeUnY1vpLTiVxyZC1E3 R+7ulfe+/zoQvWIer9H85ySLuOjSmmXb5CM9Fc0WLSsvKmTKfUNe/g5Cce+rngPY x/+tIBvXgFSoGR4oO5dEHhXn9Hzqr0OHbZy1dLKY1RU4NzxLsAtR2DH4ps25I4ux Ty2P0kYwm5Sz43MspnFAPTaU5kC3qHVNMjanbb5I7xGF2m0HZmh0zRHBC50DqP4Y nmLUklpX4EGVAYGb94YZMa3ugksSvie2SLgk838UQG+lGqaQoxAyAeRmDdyR1zE7 GHlkNxWj8cbkBsPDSYt6Wvrt+7+e8Bbk5Y/fM5+j02h6ehs9wqOaQ985CfvrrYix RyGc7pWt1FPL7Kqv/CtbDieglS/P0BMPPGYX2rfidk6i+0knWaE= =53PP -----END PGP SIGNATURE----- Merge 5.4.13 into android-5.4 Changes in 5.4.13 HID: hidraw, uhid: Always report EPOLLOUT rtc: mt6397: fix alarm register overwrite phy: mapphone-mdm6600: Fix uninitialized status value regression RDMA/bnxt_re: Avoid freeing MR resources if dereg fails RDMA/bnxt_re: Fix Send Work Entry state check while polling completions IB/hfi1: Don't cancel unused work item mtd: rawnand: stm32_fmc2: avoid to lock the CPU bus i2c: bcm2835: Store pointer to bus clock ASoC: SOF: imx8: fix memory allocation failure check on priv->pd_dev ASoC: soc-core: Set dpcm_playback / dpcm_capture ASoC: stm32: spdifrx: fix inconsistent lock state ASoC: stm32: spdifrx: fix race condition in irq handler ASoC: stm32: spdifrx: fix input pin state management pinctrl: lochnagar: select GPIOLIB netfilter: nft_flow_offload: fix underflow in flowtable reference counter ASoC: SOF: imx8: Fix dsp_box offset mtd: onenand: omap2: Pass correct flags for prep_dma_memcpy gpio: zynq: Fix for bug in zynq_gpio_restore_context API pinctrl: meson: Fix wrong shift value when get drive-strength selftests: loopback.sh: skip this test if the driver does not support iommu/vt-d: Unlink device if failed to 
add to group iommu: Remove device link to group on failure bpf: cgroup: prevent out-of-order release of cgroup bpf fs: move guard_bio_eod() after bio_set_op_attrs scsi: mpt3sas: Fix double free in attach error handling gpio: Fix error message on out-of-range GPIO in lookup table PM / devfreq: tegra: Add COMMON_CLK dependency PCI: amlogic: Fix probed clock names drm/tegra: Fix ordering of cleanup code hsr: add hsr root debugfs directory hsr: rename debugfs file when interface name is changed hsr: reset network header when supervision frame is created s390/qeth: fix qdio teardown after early init error s390/qeth: fix false reporting of VNIC CHAR config failure s390/qeth: Fix vnicc_is_in_use if rx_bcast not set s390/qeth: vnicc Fix init to default s390/qeth: fix initialization on old HW cifs: Adjust indentation in smb2_open_file scsi: smartpqi: Update attribute name to `driver_version` MAINTAINERS: Append missed file to the database ath9k: use iowrite32 over __raw_writel can: j1939: fix address claim code example dt-bindings: reset: Fix brcmstb-reset example reset: brcmstb: Remove resource checks afs: Fix missing cell comparison in afs_test_super() perf vendor events s390: Remove name from L1D_RO_EXCL_WRITES description syscalls/x86: Wire up COMPAT_SYSCALL_DEFINE0 syscalls/x86: Use COMPAT_SYSCALL_DEFINE0 for IA32 (rt_)sigreturn syscalls/x86: Use the correct function type for sys_ni_syscall syscalls/x86: Fix function types in COND_SYSCALL hsr: fix slab-out-of-bounds Read in hsr_debugfs_rename() btrfs: simplify inode locking for RWF_NOWAIT netfilter: nf_tables_offload: release flow_rule on error from commit path netfilter: nft_meta: use 64-bit time arithmetic ASoC: dt-bindings: mt8183: add missing update ASoC: simple_card_utils.h: Add missing include ASoC: fsl_esai: Add spin lock to protect reset, stop and start ASoC: SOF: Intel: Broadwell: clarify mutual exclusion with legacy driver ASoC: core: Fix compile warning with CONFIG_DEBUG_FS=n ASoC: rsnd: fix DALIGN register 
for SSIU RDMA/hns: Prevent undefined behavior in hns_roce_set_user_sq_size() RDMA/hns: remove a redundant le16_to_cpu RDMA/hns: Modify return value of restrack functions RDMA/counter: Prevent QP counter manual binding in auto mode RDMA/siw: Fix port number endianness in a debug message RDMA/hns: Fix build error again RDMA/hns: Release qp resources when failed to destroy qp xprtrdma: Add unique trace points for posting Local Invalidate WRs xprtrdma: Connection becomes unstable after a reconnect xprtrdma: Fix MR list handling xprtrdma: Close window between waking RPC senders and posting Receives RDMA/hns: Fix to support 64K page for srq RDMA/hns: Bugfix for qpc/cqc timer configuration rdma: Remove nes ABI header RDMA/mlx5: Return proper error value RDMA/srpt: Report the SCSI residual to the initiator uaccess: Add non-pagefault user-space write function bpf: Make use of probe_user_write in probe write helper bpf: skmsg, fix potential psock NULL pointer dereference bpf: Support pre-2.25-binutils objcopy for vmlinux BTF libbpf: Fix Makefile' libbpf symbol mismatch diagnostic afs: Fix use-after-loss-of-ref afs: Fix afs_lookup() to not clobber the version on a new dentry keys: Fix request_key() cache scsi: enclosure: Fix stale device oops with hot replug scsi: sd: Clear sdkp->protection_type if disk is reformatted without PI platform/mellanox: fix potential deadlock in the tmfifo driver platform/x86: asus-wmi: Fix keyboard brightness cannot be set to 0 platform/x86: GPD pocket fan: Use default values when wrong modparams are given asm-generic/nds32: don't redefine cacheflush primitives Documentation/ABI: Fix documentation inconsistency for mlxreg-io sysfs interfaces Documentation/ABI: Add missed attribute for mlxreg-io sysfs interfaces xprtrdma: Fix create_qp crash on device unload xprtrdma: Fix completion wait during device removal xprtrdma: Fix oops in Receive handler after device removal dm: add dm-clone to the documentation index scsi: ufs: Give an unique ID to each 
ufs-bsg crypto: cavium/nitrox - fix firmware assignment to AE cores crypto: hisilicon - select NEED_SG_DMA_LENGTH in qm Kconfig crypto: arm64/aes-neonbs - add return value of skcipher_walk_done() in __xts_crypt() crypto: virtio - implement missing support for output IVs crypto: algif_skcipher - Use chunksize instead of blocksize crypto: geode-aes - convert to skcipher API and make thread-safe NFSv2: Fix a typo in encode_sattr() nfsd: Fix cld_net->cn_tfm initialization nfsd: v4 support requires CRYPTO_SHA256 NFSv4.x: Handle bad/dead sessions correctly in nfs41_sequence_process() NFSv4.x: Drop the slot if nfs4_delegreturn_prepare waits for layoutreturn iio: imu: st_lsm6dsx: fix gyro gain definitions for LSM9DS1 iio: imu: adis16480: assign bias value only if operation succeeded mei: fix modalias documentation clk: meson: axg-audio: fix regmap last register clk: samsung: exynos5420: Preserve CPU clocks configuration during suspend/resume clk: Fix memory leak in clk_unregister() dmaengine: dw: platform: Mark 'hclk' clock optional clk: imx: pll14xx: Fix quick switch of S/K parameter rsi: fix potential null dereference in rsi_probe() affs: fix a memory leak in affs_remount pinctl: ti: iodelay: fix error checking on pinctrl_count_index_with_args call pinctrl: sh-pfc: Fix PINMUX_IPSR_PHYS() to set GPSR pinctrl: sh-pfc: Do not use platform_get_irq() to count interrupts pinctrl: lewisburg: Update pin list according to v1.1v6 PCI: pciehp: Do not disable interrupt twice on suspend Revert "drm/virtio: switch virtio_gpu_wait_ioctl() to gem helper." 
drm/amdgpu: cleanup creating BOs at fixed location (v2) drm/amdgpu/discovery: reserve discovery data at the top of VRAM scsi: sd: enable compat ioctls for sed-opal arm64: dts: apq8096-db820c: Increase load on l21 for SDCARD gfs2: add compat_ioctl support af_unix: add compat_ioctl support compat_ioctl: handle SIOCOUTQNSD PCI: aardvark: Use LTSSM state to build link training flag PCI: aardvark: Fix PCI_EXP_RTCTL register configuration PCI: dwc: Fix find_next_bit() usage PCI: Fix missing bridge dma_ranges resource list cleanup PCI/PM: Clear PCIe PME Status even for legacy power management tools: PCI: Fix fd leakage PCI/PTM: Remove spurious "d" from granularity message powerpc/powernv: Disable native PCIe port management MIPS: PCI: remember nasid changed by set interrupt affinity MIPS: Loongson: Fix return value of loongson_hwmon_init MIPS: SGI-IP27: Fix crash, when CPUs are disabled via nr_cpus parameter tty: serial: imx: use the sg count from dma_map_sg tty: serial: pch_uart: correct usage of dma_unmap_sg ARM: 8943/1: Fix topology setup in case of CPU hotplug for CONFIG_SCHED_MC media: ov6650: Fix incorrect use of JPEG colorspace media: ov6650: Fix some format attributes not under control media: ov6650: Fix .get_fmt() V4L2_SUBDEV_FORMAT_TRY support media: ov6650: Fix default format not applied on device probe media: rcar-vin: Fix incorrect return statement in rvin_try_format() media: hantro: h264: Fix the frame_num wraparound case media: v4l: cadence: Fix how unsued lanes are handled in 'csi2rx_start()' media: exynos4-is: Fix recursive locking in isp_video_release() media: coda: fix deadlock between decoder picture run and start command media: cedrus: Use correct H264 8x8 scaling list media: hantro: Do not reorder H264 scaling list media: aspeed-video: Fix memory leaks in aspeed_video_probe media: hantro: Set H264 FIELDPIC_FLAG_E flag correctly iommu/mediatek: Correct the flush_iotlb_all callback iommu/mediatek: Add a new tlb_lock for tlb_flush memory: mtk-smi: Add 
PM suspend and resume ops Revert "ubifs: Fix memory leak bug in alloc_ubifs_info() error path" ubifs: Fixed missed le64_to_cpu() in journal ubifs: do_kill_orphans: Fix a memory leak bug spi: sprd: Fix the incorrect SPI register mtd: spi-nor: fix silent truncation in spi_nor_read() mtd: spi-nor: fix silent truncation in spi_nor_read_raw() spi: pxa2xx: Set controller->max_transfer_size in dma mode spi: atmel: fix handling of cs_change set on non-last xfer spi: rspi: Use platform_get_irq_byname_optional() for optional irqs spi: lpspi: fix memory leak in fsl_lpspi_probe iwlwifi: mvm: consider ieee80211 station max amsdu value rtlwifi: Remove unnecessary NULL check in rtl_regd_init iwlwifi: mvm: fix support for single antenna diversity sch_cake: Add missing NLA policy entry TCA_CAKE_SPLIT_GSO f2fs: fix potential overflow NFSD fixing possible null pointer derefering in copy offload rtc: msm6242: Fix reading of 10-hour digit rtc: brcmstb-waketimer: add missed clk_disable_unprepare rtc: bd70528: Add MODULE ALIAS to autoload module gpio: mpc8xxx: Add platform device to gpiochip->parent scsi: libcxgbi: fix NULL pointer dereference in cxgbi_device_destroy() scsi: target/iblock: Fix protection error with blocks greater than 512B selftests: firmware: Fix it to do root uid check and skip rseq/selftests: Turn off timeout setting riscv: export flush_icache_all to modules mips: cacheinfo: report shared CPU map mips: Fix gettimeofday() in the vdso library tomoyo: Suppress RCU warning at list_for_each_entry_rcu(). MIPS: Prevent link failure with kcov instrumentation drm/arm/mali: make malidp_mw_connector_helper_funcs static rxrpc: Unlock new call in rxrpc_new_incoming_call() rather than the caller rxrpc: Don't take call->user_mutex in rxrpc_new_incoming_call() rxrpc: Fix missing security check on incoming calls dmaengine: k3dma: Avoid null pointer traversal s390/qeth: lock the card while changing its hsuid ioat: ioat_alloc_ring() failure handling. 
drm/amdgpu: enable gfxoff for raven1 refresh media: intel-ipu3: Align struct ipu3_uapi_awb_fr_config_s to 32 bytes kbuild/deb-pkg: annotate libelf-dev dependency as :native hexagon: parenthesize registers in asm predicates hexagon: work around compiler crash ocfs2: call journal flush to mark journal as empty after journal recovery when mount Linux 5.4.13 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I90734cd9d80f000e05a8109a529916ae641cdede
796 lines
22 KiB
C
796 lines
22 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* fs/mpage.c
|
|
*
|
|
* Copyright (C) 2002, Linus Torvalds.
|
|
*
|
|
* Contains functions related to preparing and submitting BIOs which contain
|
|
* multiple pagecache pages.
|
|
*
|
|
* 15May2002 Andrew Morton
|
|
* Initial version
|
|
* 27Jun2002 axboe@suse.de
|
|
* use bio_add_page() to build bio's just the right size
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/export.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/kdev_t.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/buffer_head.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/prefetch.h>
|
|
#include <linux/mpage.h>
|
|
#include <linux/mm_inline.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/cleancache.h>
|
|
#include "internal.h"
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/android_fs.h>
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start);
|
|
EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end);
|
|
EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start);
|
|
EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end);
|
|
|
|
/*
|
|
* I/O completion handler for multipage BIOs.
|
|
*
|
|
* The mpage code never puts partial pages into a BIO (except for end-of-file).
|
|
* If a page does not map to a contiguous run of blocks then it simply falls
|
|
* back to block_read_full_page().
|
|
*
|
|
* Why is this? If a page's completion depends on a number of different BIOs
|
|
* which can complete in any order (or at the same time) then determining the
|
|
* status of that page is hard. See end_buffer_async_read() for the details.
|
|
* There is no point in duplicating all that complexity.
|
|
*/
|
|
static void mpage_end_io(struct bio *bio)
|
|
{
|
|
struct bio_vec *bv;
|
|
struct bvec_iter_all iter_all;
|
|
|
|
if (trace_android_fs_dataread_end_enabled() &&
|
|
(bio_data_dir(bio) == READ)) {
|
|
struct page *first_page = bio->bi_io_vec[0].bv_page;
|
|
|
|
if (first_page != NULL)
|
|
trace_android_fs_dataread_end(first_page->mapping->host,
|
|
page_offset(first_page),
|
|
bio->bi_iter.bi_size);
|
|
}
|
|
|
|
bio_for_each_segment_all(bv, bio, iter_all) {
|
|
struct page *page = bv->bv_page;
|
|
page_endio(page, bio_op(bio),
|
|
blk_status_to_errno(bio->bi_status));
|
|
}
|
|
|
|
bio_put(bio);
|
|
}
|
|
|
|
static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
|
|
{
|
|
if (trace_android_fs_dataread_start_enabled() && (op == REQ_OP_READ)) {
|
|
struct page *first_page = bio->bi_io_vec[0].bv_page;
|
|
|
|
if (first_page != NULL) {
|
|
char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
|
|
|
|
path = android_fstrace_get_pathname(pathbuf,
|
|
MAX_TRACE_PATHBUF_LEN,
|
|
first_page->mapping->host);
|
|
trace_android_fs_dataread_start(
|
|
first_page->mapping->host,
|
|
page_offset(first_page),
|
|
bio->bi_iter.bi_size,
|
|
current->pid,
|
|
path,
|
|
current->comm);
|
|
}
|
|
}
|
|
bio->bi_end_io = mpage_end_io;
|
|
bio_set_op_attrs(bio, op, op_flags);
|
|
guard_bio_eod(bio);
|
|
submit_bio(bio);
|
|
return NULL;
|
|
}
|
|
|
|
static struct bio *
|
|
mpage_alloc(struct block_device *bdev,
|
|
sector_t first_sector, int nr_vecs,
|
|
gfp_t gfp_flags)
|
|
{
|
|
struct bio *bio;
|
|
|
|
/* Restrict the given (page cache) mask for slab allocations */
|
|
gfp_flags &= GFP_KERNEL;
|
|
bio = bio_alloc(gfp_flags, nr_vecs);
|
|
|
|
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
|
|
while (!bio && (nr_vecs /= 2))
|
|
bio = bio_alloc(gfp_flags, nr_vecs);
|
|
}
|
|
|
|
if (bio) {
|
|
bio_set_dev(bio, bdev);
|
|
bio->bi_iter.bi_sector = first_sector;
|
|
}
|
|
return bio;
|
|
}
|
|
|
|
/*
|
|
* support function for mpage_readpages. The fs supplied get_block might
|
|
* return an up to date buffer. This is used to map that buffer into
|
|
* the page, which allows readpage to avoid triggering a duplicate call
|
|
* to get_block.
|
|
*
|
|
* The idea is to avoid adding buffers to pages that don't already have
|
|
* them. So when the buffer is up to date and the page size == block size,
|
|
* this marks the page up to date instead of adding new buffers.
|
|
*/
|
|
static void
|
|
map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
|
|
{
|
|
struct inode *inode = page->mapping->host;
|
|
struct buffer_head *page_bh, *head;
|
|
int block = 0;
|
|
|
|
if (!page_has_buffers(page)) {
|
|
/*
|
|
* don't make any buffers if there is only one buffer on
|
|
* the page and the page just needs to be set up to date
|
|
*/
|
|
if (inode->i_blkbits == PAGE_SHIFT &&
|
|
buffer_uptodate(bh)) {
|
|
SetPageUptodate(page);
|
|
return;
|
|
}
|
|
create_empty_buffers(page, i_blocksize(inode), 0);
|
|
}
|
|
head = page_buffers(page);
|
|
page_bh = head;
|
|
do {
|
|
if (block == page_block) {
|
|
page_bh->b_state = bh->b_state;
|
|
page_bh->b_bdev = bh->b_bdev;
|
|
page_bh->b_blocknr = bh->b_blocknr;
|
|
break;
|
|
}
|
|
page_bh = page_bh->b_this_page;
|
|
block++;
|
|
} while (page_bh != head);
|
|
}
|
|
|
|
struct mpage_readpage_args {
|
|
struct bio *bio;
|
|
struct page *page;
|
|
unsigned int nr_pages;
|
|
bool is_readahead;
|
|
sector_t last_block_in_bio;
|
|
struct buffer_head map_bh;
|
|
unsigned long first_logical_block;
|
|
get_block_t *get_block;
|
|
};
|
|
|
|
/*
|
|
* This is the worker routine which does all the work of mapping the disk
|
|
* blocks and constructs largest possible bios, submits them for IO if the
|
|
* blocks are not contiguous on the disk.
|
|
*
|
|
* We pass a buffer_head back and forth and use its buffer_mapped() flag to
|
|
* represent the validity of its disk mapping and to decide when to do the next
|
|
* get_block() call.
|
|
*/
|
|
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
|
|
{
|
|
struct page *page = args->page;
|
|
struct inode *inode = page->mapping->host;
|
|
const unsigned blkbits = inode->i_blkbits;
|
|
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
|
|
const unsigned blocksize = 1 << blkbits;
|
|
struct buffer_head *map_bh = &args->map_bh;
|
|
sector_t block_in_file;
|
|
sector_t last_block;
|
|
sector_t last_block_in_file;
|
|
sector_t blocks[MAX_BUF_PER_PAGE];
|
|
unsigned page_block;
|
|
unsigned first_hole = blocks_per_page;
|
|
struct block_device *bdev = NULL;
|
|
int length;
|
|
int fully_mapped = 1;
|
|
int op_flags;
|
|
unsigned nblocks;
|
|
unsigned relative_block;
|
|
gfp_t gfp;
|
|
|
|
if (args->is_readahead) {
|
|
op_flags = REQ_RAHEAD;
|
|
gfp = readahead_gfp_mask(page->mapping);
|
|
} else {
|
|
op_flags = 0;
|
|
gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
|
|
}
|
|
|
|
if (page_has_buffers(page))
|
|
goto confused;
|
|
|
|
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
|
|
last_block = block_in_file + args->nr_pages * blocks_per_page;
|
|
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
|
|
if (last_block > last_block_in_file)
|
|
last_block = last_block_in_file;
|
|
page_block = 0;
|
|
|
|
/*
|
|
* Map blocks using the result from the previous get_blocks call first.
|
|
*/
|
|
nblocks = map_bh->b_size >> blkbits;
|
|
if (buffer_mapped(map_bh) &&
|
|
block_in_file > args->first_logical_block &&
|
|
block_in_file < (args->first_logical_block + nblocks)) {
|
|
unsigned map_offset = block_in_file - args->first_logical_block;
|
|
unsigned last = nblocks - map_offset;
|
|
|
|
for (relative_block = 0; ; relative_block++) {
|
|
if (relative_block == last) {
|
|
clear_buffer_mapped(map_bh);
|
|
break;
|
|
}
|
|
if (page_block == blocks_per_page)
|
|
break;
|
|
blocks[page_block] = map_bh->b_blocknr + map_offset +
|
|
relative_block;
|
|
page_block++;
|
|
block_in_file++;
|
|
}
|
|
bdev = map_bh->b_bdev;
|
|
}
|
|
|
|
/*
|
|
* Then do more get_blocks calls until we are done with this page.
|
|
*/
|
|
map_bh->b_page = page;
|
|
while (page_block < blocks_per_page) {
|
|
map_bh->b_state = 0;
|
|
map_bh->b_size = 0;
|
|
|
|
if (block_in_file < last_block) {
|
|
map_bh->b_size = (last_block-block_in_file) << blkbits;
|
|
if (args->get_block(inode, block_in_file, map_bh, 0))
|
|
goto confused;
|
|
args->first_logical_block = block_in_file;
|
|
}
|
|
|
|
if (!buffer_mapped(map_bh)) {
|
|
fully_mapped = 0;
|
|
if (first_hole == blocks_per_page)
|
|
first_hole = page_block;
|
|
page_block++;
|
|
block_in_file++;
|
|
continue;
|
|
}
|
|
|
|
/* some filesystems will copy data into the page during
|
|
* the get_block call, in which case we don't want to
|
|
* read it again. map_buffer_to_page copies the data
|
|
* we just collected from get_block into the page's buffers
|
|
* so readpage doesn't have to repeat the get_block call
|
|
*/
|
|
if (buffer_uptodate(map_bh)) {
|
|
map_buffer_to_page(page, map_bh, page_block);
|
|
goto confused;
|
|
}
|
|
|
|
if (first_hole != blocks_per_page)
|
|
goto confused; /* hole -> non-hole */
|
|
|
|
/* Contiguous blocks? */
|
|
if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
|
|
goto confused;
|
|
nblocks = map_bh->b_size >> blkbits;
|
|
for (relative_block = 0; ; relative_block++) {
|
|
if (relative_block == nblocks) {
|
|
clear_buffer_mapped(map_bh);
|
|
break;
|
|
} else if (page_block == blocks_per_page)
|
|
break;
|
|
blocks[page_block] = map_bh->b_blocknr+relative_block;
|
|
page_block++;
|
|
block_in_file++;
|
|
}
|
|
bdev = map_bh->b_bdev;
|
|
}
|
|
|
|
if (first_hole != blocks_per_page) {
|
|
zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
|
|
if (first_hole == 0) {
|
|
SetPageUptodate(page);
|
|
unlock_page(page);
|
|
goto out;
|
|
}
|
|
} else if (fully_mapped) {
|
|
SetPageMappedToDisk(page);
|
|
}
|
|
|
|
if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
|
|
cleancache_get_page(page) == 0) {
|
|
SetPageUptodate(page);
|
|
goto confused;
|
|
}
|
|
|
|
/*
|
|
* This page will go to BIO. Do we need to send this BIO off first?
|
|
*/
|
|
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
|
|
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
|
|
|
|
alloc_new:
|
|
if (args->bio == NULL) {
|
|
if (first_hole == blocks_per_page) {
|
|
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
|
|
page))
|
|
goto out;
|
|
}
|
|
args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
|
|
min_t(int, args->nr_pages,
|
|
BIO_MAX_PAGES),
|
|
gfp);
|
|
if (args->bio == NULL)
|
|
goto confused;
|
|
}
|
|
|
|
length = first_hole << blkbits;
|
|
if (bio_add_page(args->bio, page, length, 0) < length) {
|
|
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
|
|
goto alloc_new;
|
|
}
|
|
|
|
relative_block = block_in_file - args->first_logical_block;
|
|
nblocks = map_bh->b_size >> blkbits;
|
|
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
|
|
(first_hole != blocks_per_page))
|
|
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
|
|
else
|
|
args->last_block_in_bio = blocks[blocks_per_page - 1];
|
|
out:
|
|
return args->bio;
|
|
|
|
confused:
|
|
if (args->bio)
|
|
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
|
|
if (!PageUptodate(page))
|
|
block_read_full_page(page, args->get_block);
|
|
else
|
|
unlock_page(page);
|
|
goto out;
|
|
}
|
|
|
|
/**
|
|
* mpage_readpages - populate an address space with some pages & start reads against them
|
|
* @mapping: the address_space
|
|
* @pages: The address of a list_head which contains the target pages. These
|
|
* pages have their ->index populated and are otherwise uninitialised.
|
|
* The page at @pages->prev has the lowest file offset, and reads should be
|
|
* issued in @pages->prev to @pages->next order.
|
|
* @nr_pages: The number of pages at *@pages
|
|
* @get_block: The filesystem's block mapper function.
|
|
*
|
|
* This function walks the pages and the blocks within each page, building and
|
|
* emitting large BIOs.
|
|
*
|
|
* If anything unusual happens, such as:
|
|
*
|
|
* - encountering a page which has buffers
|
|
* - encountering a page which has a non-hole after a hole
|
|
* - encountering a page with non-contiguous blocks
|
|
*
|
|
* then this code just gives up and calls the buffer_head-based read function.
|
|
* It does handle a page which has holes at the end - that is a common case:
|
|
* the end-of-file on blocksize < PAGE_SIZE setups.
|
|
*
|
|
* BH_Boundary explanation:
|
|
*
|
|
* There is a problem. The mpage read code assembles several pages, gets all
|
|
* their disk mappings, and then submits them all. That's fine, but obtaining
|
|
* the disk mappings may require I/O. Reads of indirect blocks, for example.
|
|
*
|
|
* So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
|
|
* submitted in the following order:
|
|
*
|
|
* 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
|
|
*
|
|
* because the indirect block has to be read to get the mappings of blocks
|
|
* 13,14,15,16. Obviously, this impacts performance.
|
|
*
|
|
* So what we do is to allow the filesystem's get_block() function to set
|
|
* BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
|
|
* after this one will require I/O against a block which is probably close to
|
|
* this one. So you should push what I/O you have currently accumulated.
|
|
*
|
|
* This all causes the disk requests to be issued in the correct order.
|
|
*/
|
|
int
|
|
mpage_readpages(struct address_space *mapping, struct list_head *pages,
|
|
unsigned nr_pages, get_block_t get_block)
|
|
{
|
|
struct mpage_readpage_args args = {
|
|
.get_block = get_block,
|
|
.is_readahead = true,
|
|
};
|
|
unsigned page_idx;
|
|
|
|
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
|
|
struct page *page = lru_to_page(pages);
|
|
|
|
prefetchw(&page->flags);
|
|
list_del(&page->lru);
|
|
if (!add_to_page_cache_lru(page, mapping,
|
|
page->index,
|
|
readahead_gfp_mask(mapping))) {
|
|
args.page = page;
|
|
args.nr_pages = nr_pages - page_idx;
|
|
args.bio = do_mpage_readpage(&args);
|
|
}
|
|
put_page(page);
|
|
}
|
|
BUG_ON(!list_empty(pages));
|
|
if (args.bio)
|
|
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(mpage_readpages);
|
|
|
|
/*
|
|
* This isn't called much at all
|
|
*/
|
|
int mpage_readpage(struct page *page, get_block_t get_block)
{
	struct mpage_readpage_args args = {
		.page = page,
		.nr_pages = 1,
		.get_block = get_block,
	};
	struct bio *pending;

	/* Single-page, non-readahead read; submit any bio left pending. */
	pending = do_mpage_readpage(&args);
	if (pending)
		mpage_bio_submit(REQ_OP_READ, 0, pending);
	return 0;
}
EXPORT_SYMBOL(mpage_readpage);
|
|
|
|
/*
|
|
* Writing is not so simple.
|
|
*
|
|
* If the page has buffers then they will be used for obtaining the disk
|
|
* mapping. We only support pages which are fully mapped-and-dirty, with a
|
|
* special case for pages which are unmapped at the end: end-of-file.
|
|
*
|
|
* If the page has no buffers (preferred) then the page is mapped here.
|
|
*
|
|
* If all blocks are found to be contiguous then the page can go into the
|
|
* BIO. Otherwise fall back to the mapping's writepage().
|
|
*
|
|
* FIXME: This code wants an estimate of how many pages are still to be
|
|
* written, so it can intelligently allocate a suitably-sized BIO. For now,
|
|
* just allocate full-size (16-page) BIOs.
|
|
*/
|
|
|
|
struct mpage_data {
|
|
struct bio *bio;
|
|
sector_t last_block_in_bio;
|
|
get_block_t *get_block;
|
|
unsigned use_writepage;
|
|
};
|
|
|
|
/*
|
|
* We have our BIO, so we can now mark the buffers clean. Make
|
|
* sure to only clean buffers which we know we'll be writing.
|
|
*/
|
|
static void clean_buffers(struct page *page, unsigned first_unmapped)
|
|
{
|
|
unsigned buffer_counter = 0;
|
|
struct buffer_head *bh, *head;
|
|
if (!page_has_buffers(page))
|
|
return;
|
|
head = page_buffers(page);
|
|
bh = head;
|
|
|
|
do {
|
|
if (buffer_counter++ == first_unmapped)
|
|
break;
|
|
clear_buffer_dirty(bh);
|
|
bh = bh->b_this_page;
|
|
} while (bh != head);
|
|
|
|
/*
|
|
* we cannot drop the bh if the page is not uptodate or a concurrent
|
|
* readpage would fail to serialize with the bh and it would read from
|
|
* disk before we reach the platter.
|
|
*/
|
|
if (buffer_heads_over_limit && PageUptodate(page))
|
|
try_to_free_buffers(page);
|
|
}
|
|
|
|
/*
|
|
* For situations where we want to clean all buffers attached to a page.
|
|
* We don't need to calculate how many buffers are attached to the page,
|
|
* we just need to specify a number larger than the maximum number of buffers.
|
|
*/
|
|
void clean_page_buffers(struct page *page)
{
	/* Larger than any possible buffer count, so every buffer is cleaned. */
	const unsigned all_buffers = ~0U;

	clean_buffers(page, all_buffers);
}
|
|
|
|
static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
|
|
void *data)
|
|
{
|
|
struct mpage_data *mpd = data;
|
|
struct bio *bio = mpd->bio;
|
|
struct address_space *mapping = page->mapping;
|
|
struct inode *inode = page->mapping->host;
|
|
const unsigned blkbits = inode->i_blkbits;
|
|
unsigned long end_index;
|
|
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
|
|
sector_t last_block;
|
|
sector_t block_in_file;
|
|
sector_t blocks[MAX_BUF_PER_PAGE];
|
|
unsigned page_block;
|
|
unsigned first_unmapped = blocks_per_page;
|
|
struct block_device *bdev = NULL;
|
|
int boundary = 0;
|
|
sector_t boundary_block = 0;
|
|
struct block_device *boundary_bdev = NULL;
|
|
int length;
|
|
struct buffer_head map_bh;
|
|
loff_t i_size = i_size_read(inode);
|
|
int ret = 0;
|
|
int op_flags = wbc_to_write_flags(wbc);
|
|
|
|
if (page_has_buffers(page)) {
|
|
struct buffer_head *head = page_buffers(page);
|
|
struct buffer_head *bh = head;
|
|
|
|
/* If they're all mapped and dirty, do it */
|
|
page_block = 0;
|
|
do {
|
|
BUG_ON(buffer_locked(bh));
|
|
if (!buffer_mapped(bh)) {
|
|
/*
|
|
* unmapped dirty buffers are created by
|
|
* __set_page_dirty_buffers -> mmapped data
|
|
*/
|
|
if (buffer_dirty(bh))
|
|
goto confused;
|
|
if (first_unmapped == blocks_per_page)
|
|
first_unmapped = page_block;
|
|
continue;
|
|
}
|
|
|
|
if (first_unmapped != blocks_per_page)
|
|
goto confused; /* hole -> non-hole */
|
|
|
|
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
|
|
goto confused;
|
|
if (page_block) {
|
|
if (bh->b_blocknr != blocks[page_block-1] + 1)
|
|
goto confused;
|
|
}
|
|
blocks[page_block++] = bh->b_blocknr;
|
|
boundary = buffer_boundary(bh);
|
|
if (boundary) {
|
|
boundary_block = bh->b_blocknr;
|
|
boundary_bdev = bh->b_bdev;
|
|
}
|
|
bdev = bh->b_bdev;
|
|
} while ((bh = bh->b_this_page) != head);
|
|
|
|
if (first_unmapped)
|
|
goto page_is_mapped;
|
|
|
|
/*
|
|
* Page has buffers, but they are all unmapped. The page was
|
|
* created by pagein or read over a hole which was handled by
|
|
* block_read_full_page(). If this address_space is also
|
|
* using mpage_readpages then this can rarely happen.
|
|
*/
|
|
goto confused;
|
|
}
|
|
|
|
/*
|
|
* The page has no buffers: map it to disk
|
|
*/
|
|
BUG_ON(!PageUptodate(page));
|
|
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
|
|
last_block = (i_size - 1) >> blkbits;
|
|
map_bh.b_page = page;
|
|
for (page_block = 0; page_block < blocks_per_page; ) {
|
|
|
|
map_bh.b_state = 0;
|
|
map_bh.b_size = 1 << blkbits;
|
|
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
|
|
goto confused;
|
|
if (buffer_new(&map_bh))
|
|
clean_bdev_bh_alias(&map_bh);
|
|
if (buffer_boundary(&map_bh)) {
|
|
boundary_block = map_bh.b_blocknr;
|
|
boundary_bdev = map_bh.b_bdev;
|
|
}
|
|
if (page_block) {
|
|
if (map_bh.b_blocknr != blocks[page_block-1] + 1)
|
|
goto confused;
|
|
}
|
|
blocks[page_block++] = map_bh.b_blocknr;
|
|
boundary = buffer_boundary(&map_bh);
|
|
bdev = map_bh.b_bdev;
|
|
if (block_in_file == last_block)
|
|
break;
|
|
block_in_file++;
|
|
}
|
|
BUG_ON(page_block == 0);
|
|
|
|
first_unmapped = page_block;
|
|
|
|
page_is_mapped:
|
|
end_index = i_size >> PAGE_SHIFT;
|
|
if (page->index >= end_index) {
|
|
/*
|
|
* The page straddles i_size. It must be zeroed out on each
|
|
* and every writepage invocation because it may be mmapped.
|
|
* "A file is mapped in multiples of the page size. For a file
|
|
* that is not a multiple of the page size, the remaining memory
|
|
* is zeroed when mapped, and writes to that region are not
|
|
* written out to the file."
|
|
*/
|
|
unsigned offset = i_size & (PAGE_SIZE - 1);
|
|
|
|
if (page->index > end_index || !offset)
|
|
goto confused;
|
|
zero_user_segment(page, offset, PAGE_SIZE);
|
|
}
|
|
|
|
/*
|
|
* This page will go to BIO. Do we need to send this BIO off first?
|
|
*/
|
|
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
|
|
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
|
|
|
|
alloc_new:
|
|
if (bio == NULL) {
|
|
if (first_unmapped == blocks_per_page) {
|
|
if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
|
|
page, wbc))
|
|
goto out;
|
|
}
|
|
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
|
|
BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
|
|
if (bio == NULL)
|
|
goto confused;
|
|
|
|
wbc_init_bio(wbc, bio);
|
|
bio->bi_write_hint = inode->i_write_hint;
|
|
}
|
|
|
|
/*
|
|
* Must try to add the page before marking the buffer clean or
|
|
* the confused fail path above (OOM) will be very confused when
|
|
* it finds all bh marked clean (i.e. it will not write anything)
|
|
*/
|
|
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
|
|
length = first_unmapped << blkbits;
|
|
if (bio_add_page(bio, page, length, 0) < length) {
|
|
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
|
|
goto alloc_new;
|
|
}
|
|
|
|
clean_buffers(page, first_unmapped);
|
|
|
|
BUG_ON(PageWriteback(page));
|
|
set_page_writeback(page);
|
|
unlock_page(page);
|
|
if (boundary || (first_unmapped != blocks_per_page)) {
|
|
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
|
|
if (boundary_block) {
|
|
write_boundary_block(boundary_bdev,
|
|
boundary_block, 1 << blkbits);
|
|
}
|
|
} else {
|
|
mpd->last_block_in_bio = blocks[blocks_per_page - 1];
|
|
}
|
|
goto out;
|
|
|
|
confused:
|
|
if (bio)
|
|
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
|
|
|
|
if (mpd->use_writepage) {
|
|
ret = mapping->a_ops->writepage(page, wbc);
|
|
} else {
|
|
ret = -EAGAIN;
|
|
goto out;
|
|
}
|
|
/*
|
|
* The caller has a ref on the inode, so *mapping is stable
|
|
*/
|
|
mapping_set_error(mapping, ret);
|
|
out:
|
|
mpd->bio = bio;
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
|
|
* @mapping: address space structure to write
|
|
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
|
|
* @get_block: the filesystem's block mapper function.
|
|
* If this is NULL then use a_ops->writepage. Otherwise, go
|
|
* direct-to-BIO.
|
|
*
|
|
* This is a library function, which implements the writepages()
|
|
* address_space_operation.
|
|
*
|
|
* If a page is already under I/O, generic_writepages() skips it, even
|
|
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
|
|
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
|
|
* and msync() need to guarantee that all the data which was dirty at the time
|
|
* the call was made get new I/O started against them. If wbc->sync_mode is
|
|
* WB_SYNC_ALL then we were called for data integrity and we must wait for
|
|
* existing IO to complete.
|
|
*/
|
|
int
|
|
mpage_writepages(struct address_space *mapping,
|
|
struct writeback_control *wbc, get_block_t get_block)
|
|
{
|
|
struct blk_plug plug;
|
|
int ret;
|
|
|
|
blk_start_plug(&plug);
|
|
|
|
if (!get_block)
|
|
ret = generic_writepages(mapping, wbc);
|
|
else {
|
|
struct mpage_data mpd = {
|
|
.bio = NULL,
|
|
.last_block_in_bio = 0,
|
|
.get_block = get_block,
|
|
.use_writepage = 1,
|
|
};
|
|
|
|
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
|
|
if (mpd.bio) {
|
|
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
|
|
REQ_SYNC : 0);
|
|
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
|
|
}
|
|
}
|
|
blk_finish_plug(&plug);
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(mpage_writepages);
|
|
|
|
/*
 * mpage_writepage - write one page using the filesystem's block mapper
 * @page: the page to write
 * @get_block: the filesystem's block mapper function
 * @wbc: writeback control for this write
 *
 * Runs __mpage_writepage() once for @page with a private, freshly
 * initialised mpage_data (use_writepage is 0 here, unlike in
 * mpage_writepages()), then submits any BIO that call left in mpd.bio.
 *
 * Returns whatever __mpage_writepage() returned.
 */
int mpage_writepage(struct page *page, get_block_t get_block,
	struct writeback_control *wbc)
{
	struct mpage_data state = {
		.bio = NULL,
		.last_block_in_bio = 0,
		.get_block = get_block,
		.use_writepage = 0,
	};
	int flags = 0;
	int rc;

	rc = __mpage_writepage(page, wbc, &state);

	/* Nothing left pending: no BIO to submit. */
	if (!state.bio)
		return rc;

	if (wbc->sync_mode == WB_SYNC_ALL)
		flags = REQ_SYNC;
	mpage_bio_submit(REQ_OP_WRITE, flags, state.bio);

	return rc;
}
EXPORT_SYMBOL(mpage_writepage);
|