Merge keystone/android14-6.1-keystone-qcom-release.6.1.1 (925907e) into msm-pineapple

* refs/heads/tmp-925907e:
  ANDROID: GKI: Remove CONFIG_LOCALVERSION="-mainline" on 6.1 branch
  ANDROID: gki_defconfig: sample large page_alloc allocations with HW_TAGS KASAN
  FROMLIST: kasan: allow sampling page_alloc allocations for HW_TAGS
  ANDROID: fscrypt, blk-crypto: drop HW-wrapped key compatibility check
  ANDROID: GKI: Enable CONFIG_NF_CONNTRACK_PROCFS
  ANDROID: mm: arm64: Allow remapping logical mappings as uncached
  ANDROID: fuse-bpf: Fix crash from assuming iter is kvec
  ANDROID: fuse-bpf: Add /sys/fs flags for fuse-bpf version
  ANDROID: fuse-bpf: Make sure to declare functions
  ANDROID: fuse-bpf v1.1
  ANDROID: KVM: arm64: Add helper for pKVM modules addr conversion
  ANDROID: timer: Add vendor hook for timer calc index
  ANDROID: KVM: arm64: Fix calculation for number of relocs in .hyp.reloc
  ANDROID: KVM: arm64: Ignore modules with empty .hyp.text section
  Revert "ANDROID: KVM: arm64: Make gen-hyprel emit delimiters"
  ANDROID: KVM: arm64: Resolve hyp module addresses using ELF sections
  ANDROID: dma-buf: Add vendorhook to allow mmaping more memory than a DMA-BUF holds
  ANDROID: fips140: add fips140_lab_util program
  ANDROID: fips140: add kernel crypto module
  ANDROID: arm64: simd: omit capability check in may_use_simd()
  ANDROID: arm64: disable LSE when building the FIPS140 module
  ANDROID: arm64: only permit certain alternatives in the FIPS140 module
  ANDROID: jump_label: disable jump labels in fips140.ko
  ANDROID: crypto: define fips_enabled to 1 in fips140.ko
  ANDROID: crypto: lib/aes - add vendor hooks for AES library routines
  ANDROID: crypto: lib/sha256 - add vendor hook for sha256() routine
  ANDROID: kbuild: customize module linker script for fips140 module
  ANDROID: GKI: Remove usage of __GENKSYMS__ in vendor hooks source
  ANDROID: iommu: Add vendor hook to alloc_iova()
  ANDROID: iommu: Add vendor hook to select alloc_iova algorithm
  ANDROID: iommu: Add a vendor field in iova_domain
  ANDROID: usb: gadget: f_accessory: update SS/SSP descriptors
  ANDROID: kbuild: Search external devicetree path when running clean target
  Revert "ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim"
  ANDROID: KVM: arm64: Keep the pKVM private range under 1GiB
  ANDROID: KVM: arm64: Specify stage-2-protected regions in DT
  ANDROID: KVM: arm64: Introduce concept of pKVM moveable regions
  ANDROID: KVM: arm64: Correctly flag MMIO pages as PKVM_PAGE_RESTRICTED_PROT
  ANDROID: KVM: arm64: Introduce default_host_prot()
  ANDROID: KVM: arm64: Introduce a hyp panic module notifier
  ANDROID: KVM: arm64: Expose linear map APIs to pKVM modules
  ANDROID: scheduler: add vendor-specific wake flag
  ANDROID: Add a vendor hook that allow a module to modify the wake flag
  ANDROID: futex: Add vendor hook for wait queue
  ANDROID: rwsem: Add vendor hook to the rw-semaphore
  FROMLIST: sched/pelt: Introduce PELT multiplier
  ANDROID: GKI: Export clocksource_mmio_init
  ANDROID: update "fscrypt: add support for hardware-wrapped keys" to v7
  ANDROID: update "dm: add support for passing through derive_sw_secret"
  ANDROID: update "block: add basic hardware-wrapped key support" to v7
  ANDROID: dm-default-key: update for blk-crypto changes
  UPSTREAM: blk-crypto: Add a missing include directive
  UPSTREAM: blk-crypto: move internal only declarations to blk-crypto-internal.h
  BACKPORT: blk-crypto: add a blk_crypto_config_supported_natively helper
  BACKPORT: blk-crypto: don't use struct request_queue for public interfaces
  f2fs: let's avoid panic if extent_tree is not created
  f2fs: should use a temp extent_info for lookup
  f2fs: don't mix to use union values in extent_info
  f2fs: initialize extent_cache parameter
  f2fs: fix to avoid NULL pointer dereference in f2fs_issue_flush()
  ANDROID: update the BRANCH constant
  ANDROID: inline isolate_and_split_free_page
  ANDROID: mm: compaction: fix isolate_and_split_free_page() redefinition
  ANDROID: implement wrapper for reverse migration
  ANDROID: KVM: Remove function_nocfi() leftover in pKVM modules
  ANDROID: KVM: arm64: Always declare pKVM module loading functions
  ANDROID: GKI: Source GKI_BUILD_CONFIG_FRAGMENT after setting all variables
  ANDROID: cpuidle: export cpuidle_driver_state_disabled
  UPSTREAM: mm/madvise: fix madvise_pageout for private file mappings
  ANDROID: KVM: arm64: Allow trap handling from pKVM modules
  ANDROID: KVM: arm64: Notify pKVM modules of PSCI events
  ANDROID: KVM: arm64: Allow handling illegal aborts from pKVM modules
  ANDROID: KVM: arm64: Allow SMC handling from pKVM modules
  fscrypt: add additional documentation for SM4 support
  fscrypt: remove unused Speck definitions
  fscrypt: Add SM4 XTS/CTS symmetric algorithm support
  blk-crypto: Add support for SM4-XTS blk crypto mode
  blk-crypto: pass a gendisk to blk_crypto_sysfs_{,un}register
  fscrypt: add comment for fscrypt_valid_enc_modes_v1()
  blk-crypto: Add a missing include directive
  blk-crypto: move internal only declarations to blk-crypto-internal.h
  blk-crypto: add a blk_crypto_config_supported_natively helper
  blk-crypto: don't use struct request_queue for public interfaces
  fscrypt: pass super_block to fscrypt_put_master_key_activeref()
  Linux 6.1.1
  KEYS: encrypted: fix key instantiation with user-provided data
  cifs: fix oops during encryption
  usb: dwc3: pci: Update PCIe device ID for USB3 controller on CPU sub-system for Raptor Lake
  usb: typec: ucsi: Resume in separate work
  igb: Initialize mailbox message for VF reset
  staging: r8188eu: fix led register settings
  xhci: Apply XHCI_RESET_TO_DEFAULT quirk to ADL-N
  ALSA: hda/realtek: fix mute/micmute LEDs for a HP ProBook
  USB: serial: f81534: fix division by zero on line-speed change
  USB: serial: f81232: fix division by zero on line-speed change
  USB: serial: cp210x: add Kamstrup RF sniffer PIDs
  USB: serial: option: add Quectel EM05-G modem
  usb: gadget: uvc: Prevent buffer overflow in setup handler
  udf: Fix extending file within last block
  udf: Do not bother looking for prealloc extents if i_lenExtents matches i_size
  udf: Fix preallocation discarding at indirect extent boundary
  udf: Discard preallocation before extending file with a hole
  irqchip/ls-extirq: Fix endianness detection
  mips: ralink: mt7621: do not use kzalloc too early
  mips: ralink: mt7621: soc queries and tests as functions
  mips: ralink: mt7621: define MT7621_SYSC_BASE with __iomem
  PCI: mt7621: Add sentinel to quirks table
  libbpf: Fix uninitialized warning in btf_dump_dump_type_data
  x86/vdso: Conditionally export __vdso_sgx_enter_enclave()
  Revert "ANDROID: GKI: remove CONFIG_CMDLINE_EXTEND from arm64 gki_defconfig"
  ANDROID: Revert "arm64: Drop support for CMDLINE_EXTEND"
  ANDROID: of: Support CONFIG_CMDLINE_EXTEND config option
  FROMGIT: asm-generic/io: Add _RET_IP_ to MMIO trace for more accurate debug info
  ANDROID: firmware_loader: Add support for customer firmware paths
  ANDROID: GKI: Enable CONFIG_RT_SOFTIRQ_AWARE_SCHED
  FROMLIST: trace: Add trace points for tasklet entry/exit
  FROMLIST: softirq: defer softirq processing to ksoftirqd if CPU is busy with RT
  FROMLIST: sched: Avoid placing RT threads on cores handling long softirqs
  FROMLIST: softirq: Add generic accessor to percpu softirq_pending data
  ANDROID: sched/cpuset: Add vendor hook to change tasks affinity
  ANDROID: cpuset: Make cpusets restore on hotplug
  fsverity: simplify fsverity_get_digest()
  fsverity: stop using PG_error to track error status
  f2fs: reset wait_ms to default if any of the victims have been selected
  f2fs: fix some format WARNING in debug.c and sysfs.c
  f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super()
  f2fs: fix iostat parameter for discard
  f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache
  f2fs: add block_age-based extent cache
  f2fs: allocate the extent_cache by default
  f2fs: refactor extent_cache to support for read and more
  f2fs: remove unnecessary __init_extent_tree
  f2fs: move internal functions into extent_cache.c
  f2fs: specify extent cache for read explicitly
  f2fs: introduce f2fs_is_readonly() for readability
  f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro
  f2fs: do some cleanup for f2fs module init
  MAINTAINERS: Add f2fs bug tracker link
  f2fs: remove the unused flush argument to change_curseg
  f2fs: open code allocate_segment_by_default
  f2fs: remove struct segment_allocation default_salloc_ops
  f2fs: introduce discard_urgent_util sysfs node
  f2fs: define MIN_DISCARD_GRANULARITY macro
  f2fs: init discard policy after thread wakeup
  f2fs: avoid victim selection from previous victim section
  f2fs: truncate blocks in batch in __complete_revoke_list()
  f2fs: make __queue_discard_cmd() return void
  f2fs: fix description about discard_granularity node
  f2fs: move set_file_temperature into f2fs_new_inode
  f2fs: fix to enable compress for newly created file if extension matches
  f2fs: set zstd compress level correctly
  f2fs: change type for 'sbi->readdir_ra'
  f2fs: cleanup for 'f2fs_tuning_parameters' function
  f2fs: fix to alloc_mode changed after remount on a small volume device
  f2fs: remove submit label in __submit_discard_cmd()
  f2fs: fix to do sanity check on i_extra_isize in is_alive()
  f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE
  f2fs: fix to set flush_merge opt and show noflush_merge
  f2fs: initialize locks earlier in f2fs_fill_super()
  f2fs: optimize iteration over sparse directories
  f2fs: fix to avoid accessing uninitialized spinlock
  f2fs: correct i_size change for atomic writes
  f2fs: add proc entry to show discard_plist info
  f2fs: allow to read node block after shutdown
  f2fs: replace ternary operator with max()
  f2fs: replace gc_urgent_high_remaining with gc_remaining_trials
  f2fs: add missing bracket in doc
  f2fs: use sysfs_emit instead of sprintf
  f2fs: introduce gc_mode sysfs node
  f2fs: fix to destroy sbi->post_read_wq in error path of f2fs_fill_super()
  f2fs: fix return val in f2fs_start_ckpt_thread()
  f2fs: fix the msg data type
  f2fs: fix the assign logic of iocb
  f2fs: Fix typo in comments
  f2fs: introduce max_ordered_discard sysfs node
  f2fs: allow to set compression for inlined file
  f2fs: add barrier mount option
  f2fs: fix normal discard process
  f2fs: cleanup in f2fs_create_flush_cmd_control()
  f2fs: fix gc mode when gc_urgent_high_remaining is 1
  f2fs: remove batched_trim_sections node
  f2fs: support fault injection for f2fs_is_valid_blkaddr()
  f2fs: fix to invalidate dcc->f2fs_issue_discard in error path
  f2fs: Fix the race condition of resize flag between resizefs
  f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate
  f2fs: should put a page when checking the summary info
  ANDROID: GKI: Update GKI modules protected exports
  ANDROID: GKI: Add list of protected GKI modules
  ANDROID: GKI: Only protect exports if KMI symbols are present
  ANDROID: GKI: Protect exports of protected GKI modules
  UPSTREAM: crypto: algboss - compile out test-related code when tests disabled
  UPSTREAM: crypto: kdf - silence noisy self-test
  UPSTREAM: crypto: kdf - skip self-test when tests disabled
  UPSTREAM: crypto: api - compile out crypto_boot_test_finished when tests disabled
  UPSTREAM: crypto: algboss - optimize registration of internal algorithms
  UPSTREAM: crypto: api - optimize algorithm registration when self-tests disabled
  ANDROID: KVM: arm64: Add support for non-cacheable mappings
  ANDROID: KVM: arm64: Don't filter out KVM_FUNC_MMIO_GUARD_MAP hypercalls
  ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim
  ANDROID: KVM: arm64: Move kvm_pte_table to the common header
  ANDROID: KVM: arm64: Have different callbacks for PTE manipulation
  ANDROID: KVM: arm64: Move PTE attributes definitions to the common header
  ANDROID: KVM: arm64: Split stage2_put_pte function
  ANDROID: KVM: arm64: Pass the pagetable struct as an argument to the freewalker
  ANDROID: KVM: arm64: Fix link with CONFIG_MODULES=n
  ANDROID: KVM: arm64: Fix build with CONFIG_MODULES=n
  ANDROID: KVM: arm64: Block module loading based on cmdline or HVC
  ANDROID: KVM: arm64: Support unaligned fixmap in the nVHE hyp
  ANDROID: KVM: arm64: Add support for custom hypercall registration
  ANDROID: KVM: arm64: Return a token for a pKVM module registration
  ANDROID: KVM: arm64: Introduce hyp_protect_host_page()
  ANDROID: KVM: arm64: Add a permission fault handler
  ANDROID: KVM: arm64: Introduce PKVM_PAGE_RESTRICTED_PROT
  ANDROID: KVM: arm64: Expose kvm_flush_dcache_to_poc() in module_ops
  ANDROID: KVM: arm64: Expose hyp fixmap helpers in module_ops
  ANDROID: KVM: arm64: Expose puts and putx64 in pKVM ABI
  ANDROID: KVM: arm64: Add serial framework for pKVM
  ANDROID: KVM: arm64: Expose __pkvm_create_private_mapping to pKVM modules
  ANDROID: KVM: arm64: Include .note.gnu.property in .hyp.rodata
  ANDROID: KVM: arm64: Allow loading modules to the pKVM hypervisor
  ANDROID: KVM: arm64: Refactor nvhe Makefile
  ANDROID: KVM: arm64: Make gen-hyprel emit delimiters
  ANDROID: KVM: arm64: Move gen-hyprel into a tool directory
  ANDROID: KVM: arm64: Add mapping removal interface for nVHE hyp
  ANDROID: arm64: patching: Add aarch64_addr_write()
  ANDROID: arm64: patching: Refactor __aarch64_insn_write()
  ANDROID: KVM: arm64: Use correct pkvm owners type
  ANDROID: KVM: arm64: s2mpu: S2MPU V9 code
  ANDROID: KVM: arm64: s2mpu: Add MMIO and defines for V9 S2MPU
  ANDROID: KVM: arm64: s2mpu: rename versions to match major arch
  ANDROID: KVM: arm64: s2mpu: Abstract register initialization with version_ops
  ANDROID: KVM: arm64: s2mpu: Pass driver version during init
  ANDROID: KVM: arm64: s2mpu: Add SMPT and MPT functions to pgtable abstraction
  ANDROID: KVM: arm64: s2mpu: Abstract page table ops
  ANDROID: KVM: arm64: iommu: Support dynamic driver registration in IOMMU layer
  ANDROID: KVM: arm64: Use 32-bit function ID for PSCI MEM_PROTECT call
  Revert "ANDROID: virtio_balloon: New module parameter "pkvm""
  ANDROID: KVM: arm64: s2mpu: Fix SYNC latency regression
  ANDROID: KVM: arm64: iommu: Add host_stage2_idmap_complete
  ANDROID: KVM: arm64: Don't update IOMMUs unnecessarily
  ANDROID: KVM: arm64: s2mpu: Add SysMMU_SYNC timeout
  ANDROID: KVM: arm64: s2mpu: Allow r/o access to control regs
  ANDROID: KVM: arm64: s2mpu: Allow reading MPTC entries
  ANDROID: KVM: arm64: s2mpu: Allow L1ENTRY_* r/o access
  ANDROID: KVM: arm64: s2mpu: Refactor DABT handler
  ANDROID: KVM: arm64: s2mpu: Extract L1ENTRY_* consts
  ANDROID: KVM: arm64: s2mpu: Initialize MPTs to PROT_RW
  ANDROID: KVM: arm64: iommu: Optimize snapshot_host_stage2
  ANDROID: KVM: arm64: iommu: Fix upper bound of PT walk
  ANDROID: KVM: arm64: iommu: Add pkvm_iommu_finalize
  ANDROID: KVM: arm64: iommu: No powered check in DABT handler
  ANDROID: KVM: arm64: s2mpu: Create SysMMU_SYNC driver
  ANDROID: KVM: arm64: iommu: Create parent/child relation
  ANDROID: KVM: arm64: iommu: Run validate() on struct pkvm_iommu
  ANDROID: KVM: arm64: iommu: Create private mapping last
  ANDROID: KVM: arm64: iommu: Free memory on registration error
  ANDROID: KVM: arm64: iommu: Harden __pkvm_iommu_pm_notify
  ANDROID: KVM: arm64: Remove unused IOMMU hooks, kvm_iommu_ops
  ANDROID: KVM: arm64: s2mpu: Implement host stage2 idmap callbacks
  ANDROID: KVM: arm64: s2mpu: Move mpt_update_flags into FMPT
  ANDROID: KVM: arm64: s2mpu: Replace DABT handler with callback
  ANDROID: KVM: arm64: s2mpu: Replace SMC handler with PM callbacks
  ANDROID: KVM: arm64: s2mpu: Add driver initializer
  ANDROID: KVM: arm64: s2mpu: Remove host_stage2_adjust_mmio_range
  ANDROID: KVM: arm64: s2mpu: Replace struct s2mpu with pkvm_iommu
  ANDROID: KVM: arm64: s2mpu: Remove all EL1 code
  ANDROID: KVM: arm64: s2mpu: Move SFR init to EL2
  ANDROID: KVM: arm64: iommu: Snapshot host stage-2 at driver init
  ANDROID: KVM: arm64: iommu: Host stage-2 idmap callbacks
  ANDROID: KVM: arm64: iommu: DABT handler callback
  ANDROID: KVM: arm64: iommu: Suspend/resume callbacks
  ANDROID: KVM: arm64: iommu: Register device hypcall
  ANDROID: KVM: arm64: iommu: Avoid mapping devices in host stage-2
  ANDROID: KVM: arm64: iommu: Driver initialization hypcall
  ANDROID: KVM: arm64: Fix host MMIO DABT handler IPA
  ANDROID: KVM: arm64: Wait on S2MPU.STATUS after invalidation
  ANDROID: KVM: arm64: Remove kernel-doc in S2MPU driver
  ANDROID: KVM: arm64: Initialize pkvm_pgtable.mm_ops earlier
  ANDROID: KVM: arm64: Mark select_iommu_ops static
  ANDROID: Enable KVM_S2MPU in gki_defconfig
  ANDROID: KVM: arm64: Unmap S2MPU MMIO registers from host stage-2
  ANDROID: KVM: arm64: Implement MMIO handler in S2MPU driver
  ANDROID: KVM: arm64: Modify S2MPU MPT in 'host_stage2_set_owner'
  ANDROID: KVM: arm64: Set up S2MPU Memory Protection Table
  ANDROID: KVM: arm64: Reprogram S2MPUs in 'host_smc_handler'
  ANDROID: KVM: arm64: Enable S2MPUs in __pkvm_init_stage2_iommu
  ANDROID: KVM: arm64: Copy S2MPU configuration to hyp
  ANDROID: KVM: arm64: Implement IRQ handler for S2MPU faults
  ANDROID: KVM: arm64: Allocate context IDs for valid VIDs
  ANDROID: KVM: arm64: Read and check S2MPU_VERSION
  ANDROID: KVM: arm64: Parse S2MPU MMIO region
  ANDROID: KVM: arm64: Create empty S2MPU driver
  ANDROID: KVM: arm64: Add 'host_stage2_adjust_mmio_range' to kvm_iommu_ops
  ANDROID: KVM: arm64: Add 'host_mmio_dabt_handler' to kvm_iommu_ops
  ANDROID: KVM: arm64: Add 'host_stage2_set_owner' to kvm_iommu_ops
  ANDROID: KVM: arm64: Add 'host_smc_handler' to kvm_iommu_ops
  ANDROID: KVM: arm64: Introduce IOMMU driver infrastructure
  ANDROID: KVM: arm64: Update pKVM hyp state series to v6
  ANDROID: KVM: arm64: Add protected_shared_mem statistic
  ANDROID: KVM: arm64: count KVM s2 mmu usage in nVHE protected mode
  ANDROID: KVM: arm64: Add protected_hyp_mem VM statistic
  ANDROID: KVM: arm64: Fix sparse __percpu warning
  ANDROID: KVM: arm64: Relax SMCCC version check during FF-A proxy init
  ANDROID: KVM: arm64: Increase size of FF-A buffer
  BACKPORT: FROMLIST: KVM: arm64: pkvm: Add support for fragmented FF-A descriptors
  FROMLIST: KVM: arm64: Handle FFA_MEM_LEND calls from the host
  FROMLIST: KVM: arm64: Handle FFA_MEM_RECLAIM calls from the host
  FROMLIST: KVM: arm64: Handle FFA_MEM_SHARE calls from the host
  BACKPORT: FROMLIST: KVM: arm64: Add FF-A helpers to share/unshare memory with secure world
  FROMLIST: KVM: arm64: Handle FFA_RXTX_MAP and FFA_RXTX_UNMAP calls from the host
  FROMLIST: KVM: arm64: Allocate pages for hypervisor FF-A mailboxes
  FROMLIST: KVM: arm64: Handle FFA_FEATURES call from the host
  BACKPORT: FROMLIST: KVM: arm64: Probe FF-A version and host/hyp partition ID during init
  FROMLIST: KVM: arm64: Block unsafe FF-A calls from the host
  FROMLIST: firmware: arm_ffa: Move comment before the field it is documenting
  FROMLIST: firmware: arm_ffa: Move constants to header file
  ANDROID: KVM: arm64: Issue CMOs when tearing down shadow pages
  ANDROID: KVM: arm64: Use PSCI MEM_PROTECT to zap guest pages on reset
  ANDROID: KVM: arm64: Check IPA range for pvmfw during guest donation
  ANDROID: KVM: arm64: Use fixmap when poisoning pvmfw pages
  ANDROID: KVM: arm64: Rename pkvm_clear_pvmfw_pages()
  ANDROID: KVM: arm64: Rename hyp_zero_page() and make available as helper
  ANDROID: KVM: arm64: Don't check for hyp_fixmap_map() returning NULL
  ANDROID: virtio_balloon: Do not clear VIRTIO_F_ACCESS_PLATFORM
  ANDROID: virtio_balloon: New module parameter "pkvm"
  ANDROID: KVM: arm64: Introduce kvm_has_memrelinquish_services
  ANDROID: KVM: arm64: Flush nVHE hyp_vcpu memcache
  ANDROID: KVM: arm64: Avoid unnecessary unmap walk in MEM_RELINQUISH hypercall
  ANDROID: KVM: arm64: Strictly check page type in MEM_RELINQUISH hypercall
  ANDROID: KVM: Include prototype for page_relinquish before definition
  ANDROID: arm64: ioremap/iounmap use stage-2 granule size
  ANDROID: arm64: Check if pfn is valid for all ioremap loop iterations
  ANDROID: arm64: Auto-enroll MMIO guard on protected vms
  ANDROID: KVM: arm64: Add some documentation for the MMIO guard feature
  ANDROID: KVM: arm64: Plumb MMIO checking into the fault handling
  ANDROID: KVM: arm64: pkvm: Wire MMIO guard hypercalls
  ANDROID: KVM: arm64: pkvm: Add MMIO guard infrastructure
  ANDROID: KVM: arm64: Introduce KVM_ARCH_FLAG_MMIO_GUARD flag
  ANDROID: KVM: arm64: Expose topup_hyp_memcache() to the rest of KVM
  ANDROID: KVM: arm64: Define MMIO guard hypercalls
  ANDROID: KVM: arm64: FAR_EL2 mask as a define
  ANDROID: KVM: arm64: Turn kvm_pgtable_stage2_set_owner into kvm_pgtable_stage2_annotate
  ANDROID: memory relinquish: Fix build dependencies
  ANDROID: KVM: arm64: Monitor Debug support for non-protected guests
  ANDROID: KVM: arm64: Factor out code for saving/restoring guest debug regs
  ANDROID: KVM: arm64: Flush the vcpu iflags for non-protected VMs
  ANDROID: virtio_balloon: Do not translate reported pages through DMA API
  ANDROID: KVM: arm64: balloon: Notify hyp before reporting free pages to host
  ANDROID: KVM: arm64: memory balloon: Notify hyp when ballooning
  ANDROID: Define mem_relinquish interface for releasing memory to a hypervisor.
  ANDROID: KVM: arm64: Implement MEM_RELINQUISH SMCCC hypercall
  ANDROID: KVM: arm64: Turn llist of pinned pages into an rb-tree
  FROMLIST: KVM: arm64: pkvm: Fixup boot mode to reflect that the kernel resumes from EL1
  ANDROID: KVM: arm64: Add .hyp.data section
  ANDROID: KVM: arm64: relay entropy requests from protected guests directly to secure
  ANDROID: KVM: arm64: Introduce KVM_CAP_ARM_PROTECTED_VM to set/query PVM firmware
  ANDROID: KVM: arm64: Reset primary vCPU according to PVM firmware boot protocol
  ANDROID: KVM: arm64: Copy pvmfw into guest pages during donation from the host
  ANDROID: KVM: arm64: Clear pvmfw pages on clean host shutdown
  ANDROID: KVM: arm64: Ignore length of 0 in kvm_flush_dcache_to_poc()
  ANDROID: KVM: arm64: Unmap PVM firmware from host stage-2 during de-privilege
  ANDROID: KVM: arm64: Parse reserved-memory node for pkvm guest firmware region
  ANDROID: Documentation: KVM: Add some documentation for Protected KVM on arm64
  ANDROID: BACKPORT: KVM: arm64: Introduce KVM_VM_TYPE_ARM_PROTECTED machine type for PVMs
  ANDROID: KVM: arm64: Expose memory sharing hypercalls to protected guests
  ANDROID: KVM: arm64: Reformat/beautify PTP hypercall documentation
  ANDROID: KVM: arm64: Document the KVM/arm64-specific calls in hypercalls.rst
  ANDROID: KVM: arm64: Rename firmware pseudo-register documentation file
  ANDROID: KVM: arm64: Extend memory sharing to allow guest-to-host transitions
  ANDROID: KVM: arm64: Avoid BBM when changing only s/w bits in Stage-2 PTE
  ANDROID: KVM: arm64: Support TLB invalidation in guest context
  ANDROID: KVM: arm64: Inject SIGSEGV on illegal accesses
  ANDROID: KVM: arm64: Refactor enter_exception64()
  ANDROID: KVM: arm64: Add is_pkvm_initialized() helper
  ANDROID: KVM: arm64: Don't expose TLBI hypercalls after de-privilege
  ANDROID: KVM: arm64: Handle PSCI for protected VMs in EL2
  ANDROID: KVM: arm64: Factor out vcpu_reset code for core registers and PSCI
  ANDROID: KVM: arm64: Move some kvm_psci functions to a shared header
  ANDROID: KVM: arm64: Move pstate reset values to kvm_arm.h
  ANDROID: KVM: arm64: Add HVC handling for protected guests at EL2
  ANDROID: KVM: arm64: Track the SVE state in the hypervisor vcpu structure
  ANDROID: KVM: arm64: Initialize hypervisor vm state at EL2
  ANDROID: KVM: arm64: Refactor kvm_vcpu_enable_ptrauth() for hyp use
  ANDROID: KVM: arm64: Do not update virtual timer state for protected VMs
  ANDROID: KVM: arm64: Move vgic state between host and hypervisor vcpu structures
  ANDROID: KVM: arm64: Add EL2 entry/exit handlers for pKVM guests
  ANDROID: KVM: arm64: Donate memory to protected guests
  ANDROID: KVM: arm64: Force injection of a data abort on NISV MMIO exit
  ANDROID: KVM: arm64: Advertise GICv3 sysreg interface to protected guests
  ANDROID: KVM: arm64: Fix initializing traps in protected mode
  ANDROID: KVM: arm64: Move pkvm_vcpu_init_traps to hyp vcpu init
  ANDROID: KVM: arm64: Reset sysregs for protected VMs
  ANDROID: KVM: arm64: Refactor reset_mpidr to extract its computation
  ANDROID: KVM: arm64: Do not support MTE for protected VMs
  ANDROID: KVM: arm64: Restrict protected VM capabilities
  ANDROID: KVM: arm64: Trap debug break and watch from guest
  ANDROID: KVM: arm64: Check directly whether the vcpu is protected
  ANDROID: KVM: arm64: Reduce host/hyp vcpu state copying
  ANDROID: KVM: arm64: Lazy host FP save/restore
  ANDROID: KVM: arm64: Reintroduce __sve_save_state
  ANDROID: KVM: arm64: Introduce lazy-ish state sync for non-protected VMs
  ANDROID: KVM: arm64: Introduce per-EC entry/exit handlers
  ANDROID: KVM: arm64: Ensure that TLBs and I-cache are private to each vcpu
  ANDROID: KVM: arm64: Add hyp per_cpu variable to track current physical cpu number
  ANDROID: KVM: arm64: Skip __kvm_adjust_pc() for protected vcpus
  ANDROID: KVM: arm64: Add current host and hyp vCPU lookup primitive
  ANDROID: KVM: arm64: Introduce the pkvm_vcpu_{load,put} hypercalls
  ANDROID: KVM: arm64: Add the {flush,sync}_hyp_timer_state() primitives
  ANDROID: KVM: arm64: Introduce predicates to check for protected state
  ANDROID: KVM: arm64: Add the {flush,sync}_hyp_vgic_state() primitives
  ANDROID: KVM: arm64: Simplify vgic-v3 hypercalls
  ANDROID: KVM: arm64: Make vcpu_{read,write}_sys_reg available to HYP code
  ANDROID: KVM: arm64: Split up nvhe/fixed_config.h
  ANDROID: KVM: arm64: Extend memory donation to allow host-to-guest transitions
  ANDROID: KVM: arm64: Handle guest stage-2 page-tables entirely at EL2
  ANDROID: KVM: arm64: Disallow dirty logging and RO memslots with pKVM
  ANDROID: KVM: arm64: Do not allow memslot changes after first VM run under pKVM
  ANDROID: KVM: arm64: Check for PTE validity when checking for executable/cacheable
  ANDROID: KVM: arm64: Extend memory sharing to allow host-to-guest transitions
  ANDROID: KVM: arm64: Provide a hypercall for the host to reclaim guest memory
  ANDROID: KVM: arm64: Add PC_UPDATE_REQ flags covering all PC updates
  ANDROID: KVM: arm64: Add vcpu flag copy primitive
  ANDROID: KVM: arm64: Repurpose a byte of 'order' for flags in 'struct hyp_page'
  FROMLIST: KVM: arm64: Use the pKVM hyp vCPU structure in handle___kvm_vcpu_run()
  FROMLIST: KVM: arm64: Don't unnecessarily map host kernel sections at EL2
  FROMLIST: KVM: arm64: Explicitly map 'kvm_vgic_global_state' at EL2
  FROMLIST: KVM: arm64: Maintain a copy of 'kvm_arm_vmid_bits' at EL2
  FROMLIST: KVM: arm64: Unmap 'kvm_arm_hyp_percpu_base' from the host
  FROMLIST: BACKPORT: KVM: arm64: Return guest memory from EL2 via dedicated teardown memcache
  FROMLIST: KVM: arm64: Instantiate guest stage-2 page-tables at EL2
  FROMLIST: KVM: arm64: Consolidate stage-2 initialisation into a single function
  FROMLIST: KVM: arm64: Add generic hyp_memcache helpers
  FROMLIST: KVM: arm64: Provide I-cache invalidation by virtual address at EL2
  FROMLIST: KVM: arm64: Initialise hypervisor copies of host symbols unconditionally
  FROMLIST: KVM: arm64: Add per-cpu fixmap infrastructure at EL2
  FROMLIST: KVM: arm64: Instantiate pKVM hypervisor VM and vCPU structures from EL1
  FROMLIST: KVM: arm64: Add infrastructure to create and track pKVM instances at EL2
  FROMLIST: KVM: arm64: Rename 'host_kvm' to 'host_mmu'
  FROMLIST: KVM: arm64: Add hyp_spinlock_t static initializer
  FROMLIST: KVM: arm64: Include asm/kvm_mmu.h in nvhe/mem_protect.h
  FROMLIST: KVM: arm64: Add helpers to pin memory shared with the hypervisor at EL2
  FROMLIST: KVM: arm64: Prevent the donation of no-map pages
  FROMLIST: KVM: arm64: Implement do_donate() helper for donating memory
  FROMLIST: KVM: arm64: Unify identifiers used to distinguish host and hypervisor
  FROMLIST: KVM: arm64: Fix-up hyp stage-1 refcounts for all pages mapped at EL2
  FROMLIST: KVM: arm64: Back the hypervisor 'struct hyp_page' array for all memory
  FROMLIST: KVM: arm64: Allow attaching of non-coalescable pages to a hyp pool
  FROMLIST: KVM: arm64: Move hyp refcount manipulation helpers to common header file
  ANDROID: arm64: Register earlycon fixmap with the MMIO guard
  ANDROID: arm64: Add a helper to retrieve the PTE of a fixmap
  ANDROID: BACKPORT: arm64: Enroll into KVM's MMIO guard if required
  ANDROID: BACKPORT: arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard
  ANDROID: mm/vmalloc: Add arch-specific callbacks to track io{remap,unmap} physical pages
  ANDROID: BACKPORT: arm64: mm: Implement memory encryption API using KVM sharing hypercalls
  ANDROID: drivers: hv: Include memory encryption header
  FROMLIST: firmware/smccc: Call arch-specific hook on discovering KVM services
  ANDROID: GKI: Enable CONFIG_CFI_CLANG

 Conflicts:
	drivers/android/vendor_hooks.c
	include/linux/compaction.h
	include/trace/hooks/timer.h
	mm/compaction.c

Change-Id: Ie45fe12a1d6b67f4edb8c23ebb4409754b063385
Upstream-Build: ks_qcom-android14-6.1-keystone-qcom-release@9501271 UKQ2.230118.001
Signed-off-by: jianzhou <quic_jianzhou@quicinc.com>

@@ -327,6 +327,23 @@ copy_to_dist_dir(
    flat = True,
)
kernel_build(
    name = "fips140",
    outs = [],
    base_kernel = ":kernel_aarch64",
    build_config = "build.config.gki.aarch64.fips140",
    module_outs = ["crypto/fips140.ko"],
)
copy_to_dist_dir(
    name = "fips140_dist",
    data = [
        ":fips140",
    ],
    dist_dir = "out/fips140/dist",
    flat = True,
)
# allmodconfig build tests.
# These are build tests only, so:
# - outs are intentionally set to empty to not copy anything to DIST_DIR


@@ -99,6 +99,12 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.
What: /sys/fs/f2fs/<disk>/max_ordered_discard
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Controls the maximum ordered discard, the unit size is one block(4KB).
Set it to 16 by default.
What: /sys/fs/f2fs/<disk>/max_discard_request
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
@@ -132,7 +138,8 @@ Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Controls discard granularity of inner discard thread. Inner thread
will not issue discards with size that is smaller than granularity.
The unit size is one block(4KB), now only support configuring
in range of [1, 512]. Default value is 4(=16KB).
in range of [1, 512]. Default value is 16.
For small devices, default value is 1.
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019
@@ -235,7 +242,7 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/>
Shows all enabled features in current device.
Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum,
@@ -592,10 +599,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.
What: /sys/fs/f2fs/<disk>/gc_urgent_high_remaining
Date: December 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can set the trial count limit for GC urgent high mode with this value.
What: /sys/fs/f2fs/<disk>/gc_remaining_trials
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: You can set the trial count limit for GC urgent and idle mode with this value.
If GC thread gets to the limit, the mode will turn back to GC normal mode.
By default, the value is zero, which means there is no limit like before.
@@ -634,3 +641,31 @@ Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the accumulated total revoked atomic write block count after boot.
If you write "0" here, you can initialize to "0".
What: /sys/fs/f2fs/<disk>/gc_mode
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Show the current gc_mode as a string.
This is a read-only entry.
What: /sys/fs/f2fs/<disk>/discard_urgent_util
Date: November 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: When space utilization exceeds this, do background DISCARD aggressively.
Does DISCARD forcibly in a period of given min_discard_issue_time when the number
of discards is not 0 and set discard granularity to 1.
Default: 80
What: /sys/fs/f2fs/<disk>/hot_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as hot. By default it was initialized as 262144 blocks
(equals to 1GB).
What: /sys/fs/f2fs/<disk>/warm_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as warm. By default it was initialized as 2621440 blocks
(equals to 10GB).


@@ -0,0 +1,19 @@
What: /sys/fs/fuse/features/fuse_bpf
Date: December 2022
Contact: Paul Lawrence <paullawrence@google.com>
Description:
Read-only file that contains the word 'supported' if fuse-bpf is
supported, does not exist otherwise
What: /sys/fs/fuse/bpf_prog_type_fuse
Date: December 2022
Contact: Paul Lawrence <paullawrence@google.com>
Description:
bpf_prog_type_fuse defines the program type of bpf programs that
may be passed to fuse-bpf. For upstream bpf program types, this
is a constant defined in a contiguous array of constants.
bpf_prog_type_fuse is appended to the end of the list, so it may
change and therefore its value must be read from this file.
Contents is ASCII decimal representation of bpf_prog_type_fuse
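
A minimal userspace sketch of consuming these two entries; the paths come from
the text above, everything else (including the program itself) is illustrative
only::

   #include <stdio.h>
   #include <stdlib.h>

   int main(void)
   {
       FILE *f;
       char word[16];
       int prog_type;

       /* Exists (and reads "supported") only when fuse-bpf is available. */
       f = fopen("/sys/fs/fuse/features/fuse_bpf", "r");
       if (f) {
           if (fscanf(f, "%15s", word) == 1)
               printf("fuse-bpf: %s\n", word);
           fclose(f);
       }

       /* The program type is not a fixed constant, so read it at runtime. */
       f = fopen("/sys/fs/fuse/bpf_prog_type_fuse", "r");
       if (!f || fscanf(f, "%d", &prog_type) != 1)
           return EXIT_FAILURE;
       fclose(f);
       printf("bpf_prog_type_fuse = %d\n", prog_type);
       return 0;
   }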


@@ -2184,6 +2184,9 @@
1 - Bypass the IOMMU for DMA.
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
ioremap_guard [ARM64] enable the KVM MMIO guard functionality
if available.
io7= [HW] IO7 for Marvel-based Alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
@@ -2529,7 +2532,9 @@
protected guests.
protected: nVHE-based mode with support for guests whose
state is kept private from the host.
state is kept private from the host. See
Documentation/virt/kvm/arm/pkvm.rst for more
information about this mode of operation.
Defaults to VHE/nVHE based on hardware support. Setting
mode to "protected" will disable kexec and hibernation


@@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
of inline encryption using the kernel crypto API. blk-crypto-fallback is built
into the block layer, so it works on any block device without any special setup.
Essentially, when a bio with an encryption context is submitted to a
request_queue that doesn't support that encryption context, the block layer will
block_device that doesn't support that encryption context, the block layer will
handle en/decryption of the bio using blk-crypto-fallback.
For encryption, the data cannot be encrypted in-place, as callers usually rely
@@ -187,7 +187,7 @@ API presented to users of the block layer
``blk_crypto_config_supported()`` allows users to check ahead of time whether
inline encryption with particular crypto settings will work on a particular
request_queue -- either via hardware or via blk-crypto-fallback. This function
block_device -- either via hardware or via blk-crypto-fallback. This function
takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
the actual bytes of the key and instead just contains the algorithm, data unit
size, etc. This function can be useful if blk-crypto-fallback is disabled.
@@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled.
``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
Users must call ``blk_crypto_start_using_key()`` before actually starting to use
a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
was called earlier). This is needed to initialize blk-crypto-fallback if it
will be needed. This must not be called from the data path, as this may have to
allocate resources, which may deadlock in that case.
@@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
later, as that happens automatically when the bio is freed or reset.
Finally, when done using inline encryption with a blk_crypto_key on a
request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
block_device, users must call ``blk_crypto_evict_key()``. This ensures that
the key is evicted from all keyslots it may be programmed into and unlinked from
any kernel data structures it may be linked into.
@@ -221,9 +221,9 @@ as follows:
5. ``blk_crypto_evict_key()`` (after all I/O has completed)
6. Zeroize the blk_crypto_key (this has no dedicated function)
If a blk_crypto_key is being used on multiple request_queues, then
If a blk_crypto_key is being used on multiple block_devices, then
``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
and ``blk_crypto_evict_key()`` must be called on each request_queue.
and ``blk_crypto_evict_key()`` must be called on each block_device.
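
As a rough illustration of that sequence for an in-kernel user, the sketch
below assumes the upstream post-series prototypes in which the public helpers
take a ``struct block_device``; note that the hardware-wrapped key series in
this merge extends ``blk_crypto_init_key()`` with additional key-type/size
arguments, so the exact prototype in this tree may differ. The surrounding
driver context (bdev, bio, raw_key) is hypothetical::

   #include <linux/bio.h>
   #include <linux/blk-crypto.h>
   #include <linux/string.h>

   static int example_encrypted_write(struct block_device *bdev,
                                      struct bio *bio, const u8 raw_key[32])
   {
       struct blk_crypto_key key;
       u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { 0 };
       int err;

       /* Steps 1-2: initialise the key and set up blk-crypto-fallback if needed. */
       err = blk_crypto_init_key(&key, raw_key,
                                 BLK_ENCRYPTION_MODE_AES_256_XTS,
                                 8 /* dun_bytes */, 4096 /* data unit size */);
       if (err)
           return err;
       err = blk_crypto_start_using_key(bdev, &key);
       if (err)
           return err;

       /* Steps 3-4: attach the encryption context and submit the I/O. */
       bio_crypt_set_ctx(bio, &key, dun, GFP_NOIO);
       err = submit_bio_wait(bio);

       /* Steps 5-6: once I/O has completed, evict and zeroize the key. */
       blk_crypto_evict_key(bdev, &key);
       memzero_explicit(&key, sizeof(key));
       return err;
   }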
API presented to device drivers
===============================
@@ -388,8 +388,8 @@ such as in file-based encryption. Key wrapping is a commonly used technique.)
The key which wraps (encrypts) hardware-wrapped keys is a hardware-internal key
that is never exposed to software; it is either a persistent key (a "long-term
wrapping key") or a per-boot key (an "ephemeral wrapping key"). The long-term
wrapped form of the key is what is initially unlocked, but it is discarded as
soon as it is converted into an ephemerally-wrapped key. In-use
wrapped form of the key is what is initially unlocked, but it is erased from
memory as soon as it is converted into an ephemerally-wrapped key. In-use
hardware-wrapped keys are always ephemerally-wrapped, not long-term wrapped.
As inline encryption hardware can only be used to encrypt/decrypt data on-disk,
@@ -442,8 +442,8 @@ The components are:
for cryptographic applications that require up to a 256-bit security strength.
Some use cases (e.g. full-disk encryption) won't require the software secret.
Example: in the case of fscrypt, the fscrypt master key (the key used to unlock
a particular set of encrypted directories) is made hardware-wrapped. The inline
Example: in the case of fscrypt, the fscrypt master key (the key that protects a
particular set of encrypted directories) is made hardware-wrapped. The inline
encryption key is used as the file contents encryption key, while the software
secret (rather than the master key directly) is used to key fscrypt's KDF
(HKDF-SHA512) to derive other subkeys such as filenames encryption keys.
@@ -512,5 +512,6 @@ the hardware RNG and its use to generate the key, as well as the testing of the
"import" mode as that should cover all parts other than the key generation.
For an example of a test that verifies the ciphertext written to disk in the
"import" mode, see `Android's vts_kernel_encryption_test
"import" mode, see the fscrypt hardware-wrapped key tests in xfstests, or
`Android's vts_kernel_encryption_test
<https://android.googlesource.com/platform/test/vts-testcase/kernel/+/refs/heads/master/encryption/>`_.


@@ -140,6 +140,23 @@ disabling KASAN altogether or controlling its features:
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
allocations (default: ``on``).
- ``kasan.page_alloc.sample=<sampling interval>`` makes KASAN tag only every
Nth page_alloc allocation with the order equal or greater than
``kasan.page_alloc.sample.order``, where N is the value of the ``sample``
parameter (default: ``1``, or tag every such allocation).
This parameter is intended to mitigate the performance overhead introduced
by KASAN.
Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks
of allocations chosen by sampling and thus miss bad accesses to these
allocations. Use the default value for accurate bug detection.
- ``kasan.page_alloc.sample.order=<minimum page order>`` specifies the minimum
order of allocations that are affected by sampling (default: ``3``).
Only applies when ``kasan.page_alloc.sample`` is set to a value greater
than ``1``.
This parameter is intended to allow sampling only large page_alloc
allocations, which is the biggest source of the performance overhead.
Error reports
~~~~~~~~~~~~~


@@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
For reporting bugs and sending patches, please use the following mailing list:
For sending patches, please use the following mailing list:
- linux-f2fs-devel@lists.sourceforge.net
For reporting bugs, please use the following f2fs bug tracker link:
- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
Background and Design issues
============================
@@ -154,6 +158,8 @@ nobarrier This option can be used if underlying storage guarantees
If this option is set, no cache_flush commands are issued
but f2fs still guarantees the write ordering of all the
data writes.
barrier If this option is set, cache_flush commands are allowed to be
issued.
fastboot This option is used when a system wants to reduce mount
time as much as possible, even though normal performance
can be sacrificed.
@@ -199,6 +205,7 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_SLAB_ALLOC 0x000008000
FAULT_DQUOT_INIT 0x000010000
FAULT_LOCK_OP 0x000020000
FAULT_BLKADDR 0x000040000
=================== ===========
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
@@ -340,6 +347,10 @@ memory=%s Control memory mode. This supports "normal" and "low" modes.
Because of the nature of low memory devices, in this mode, f2fs
will try to save memory sometimes by sacrificing performance.
"normal" mode is the default mode and same as before.
age_extent_cache Enable an age extent cache based on rb-tree. It records
data block update frequency of the extent per inode, in
order to provide better temperature hints for data block
allocation.
======================== ============================================================
Debugfs Entries


@@ -338,6 +338,7 @@ Currently, the following pairs of encryption modes are supported:
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Adiantum for both contents and filenames
- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)
- SM4-XTS for contents and SM4-CTS-CBC for filenames (v2 policies only)
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
@@ -369,6 +370,12 @@ CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and
POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
CRYPTO_AES_ARM64_CE_BLK for ARM64.
SM4 is a Chinese block cipher that is an alternative to AES. It has
not seen as much security review as AES, and it only has a 128-bit key
size. It may be useful in cases where its use is mandated.
Otherwise, it should not be used. For SM4 support to be available, it
also needs to be enabled in the kernel crypto API.
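
For illustration only, a userspace sketch that applies the (SM4-XTS,
SM4-CTS-CBC) pair as a v2 policy on an empty directory. It assumes a uapi
<linux/fscrypt.h> new enough to define the SM4 mode constants and that the
master key was already added with FS_IOC_ADD_ENCRYPTION_KEY; the helper name
is made up::

   #include <string.h>
   #include <sys/ioctl.h>
   #include <linux/fscrypt.h>

   static int set_sm4_policy(int dir_fd,
                             const __u8 key_id[FSCRYPT_KEY_IDENTIFIER_SIZE])
   {
       struct fscrypt_policy_v2 policy = {
           .version = FSCRYPT_POLICY_V2,
           .contents_encryption_mode = FSCRYPT_MODE_SM4_XTS,
           .filenames_encryption_mode = FSCRYPT_MODE_SM4_CTS,
           .flags = FSCRYPT_POLICY_FLAGS_PAD_32,
       };

       memcpy(policy.master_key_identifier, key_id,
              FSCRYPT_KEY_IDENTIFIER_SIZE);
       /* Fails if the kernel was built without SM4 support. */
       return ioctl(dir_fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
   }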
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing


@@ -350,7 +350,8 @@ Load an encrypted key "evm" from saved blob::
Instantiate an encrypted key "evm" using user-provided decrypted data::
$ keyctl add encrypted evm "new default user:kmk 32 `cat evm_decrypted_data.blob`" @u
$ evmkey=$(dd if=/dev/urandom bs=1 count=32 | xxd -c32 -p)
$ keyctl add encrypted evm "new default user:kmk 32 $evmkey" @u
794890253
$ keyctl print 794890253


@@ -6427,6 +6427,13 @@ Note that KVM does not skip the faulting instruction as it does for
KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
if it decides to decode and emulate the instruction.
This feature isn't available to protected VMs, as userspace does not
have access to the state that is required to perform the emulation.
Instead, a data abort exception is directly injected in the guest.
Note that although KVM_CAP_ARM_NISV_TO_USER will be reported if
queried outside of a protected VM context, the feature will not be
exposed if queried on a protected VM file descriptor.
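
A hedged userspace sketch of that behaviour (the helper name is made up):
querying the capability on a protected VM's file descriptor is expected to
return 0 even when /dev/kvm advertises it globally::

   #include <sys/ioctl.h>
   #include <linux/kvm.h>

   static int nisv_to_user_available(int vm_fd)
   {
       /* > 0 means the NISV exit can be requested for this VM. */
       return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_NISV_TO_USER) > 0;
   }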
::
/* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */


@@ -0,0 +1,138 @@
.. SPDX-License-Identifier: GPL-2.0
=======================================
ARM firmware pseudo-registers interface
=======================================
KVM handles the hypercall services as requested by the guests. New hypercall
services are regularly made available by the ARM specification or by KVM (as
vendor services) if they make sense from a virtualization point of view.
This means that a guest booted on two different versions of KVM can observe
two different "firmware" revisions. This could cause issues if a given guest
is tied to a particular version of a hypercall service, or if a migration
causes a different version to be exposed out of the blue to an unsuspecting
guest.
In order to remedy this situation, KVM exposes a set of "firmware
pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
interface. These registers can be saved/restored by userspace, and set
to a convenient value as required.
The following registers are defined:
* KVM_REG_ARM_PSCI_VERSION:
KVM implements the PSCI (Power State Coordination Interface)
specification in order to provide services such as CPU on/off, reset
and power-off to the guest.
- Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
(and thus has already been initialized)
- Returns the current PSCI version on GET_ONE_REG (defaulting to the
highest PSCI version implemented by KVM and compatible with v0.2)
- Allows any PSCI version implemented by KVM and compatible with
v0.2 to be set with SET_ONE_REG
- Affects the whole VM (even if the register view is per-vcpu)
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
Holds the state of the firmware support to mitigate CVE-2017-5715, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_1 in [1].
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
KVM does not offer
firmware support for the workaround. The mitigation status for the
guest is unknown.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
The workaround HVC call is
available to the guest and required for the mitigation.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
The workaround HVC call
is available to the guest, but it is not needed on this VCPU.
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
Holds the state of the firmware support to mitigate CVE-2018-3639, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_2 in [1]_.
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
A workaround is not
available. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
The workaround state is
unknown. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
The workaround is available,
and can be disabled by a vCPU. If
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
this vCPU.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
The workaround is always active on this vCPU or it is not needed.
Bitmap Feature Firmware Registers
---------------------------------
Contrary to the above registers, the following registers exposes the
hypercall services in the form of a feature-bitmap to the userspace. This
bitmap is translated to the services that are available to the guest.
There is a register defined per service call owner and can be accessed via
GET/SET_ONE_REG interface.
By default, these registers are set with the upper limit of the features
that are supported. This way userspace can discover all the usable
hypercall services via GET_ONE_REG. The user-space can write-back the
desired bitmap back via SET_ONE_REG. The features for the registers that
are untouched, probably because userspace isn't aware of them, will be
exposed as is to the guest.
Note that KVM will not allow the userspace to configure the registers
anymore once any of the vCPUs has run at least once. Instead, it will
return a -EBUSY.
The pseudo-firmware bitmap register are as follows:
* KVM_REG_ARM_STD_BMAP:
Controls the bitmap of the ARM Standard Secure Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0:
The bit represents the services offered under v1.0 of ARM True Random
Number Generator (TRNG) specification, ARM DEN0098.
* KVM_REG_ARM_STD_HYP_BMAP:
Controls the bitmap of the ARM Standard Hypervisor Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME:
The bit represents the Paravirtualized Time service as represented by
ARM DEN0057A.
* KVM_REG_ARM_VENDOR_HYP_BMAP:
Controls the bitmap of the Vendor specific Hypervisor Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT
The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID
and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids.
Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP:
The bit represents the Precision Time Protocol KVM service.
Errors:
======= =============================================================
-ENOENT Unknown register accessed.
-EBUSY Attempt a 'write' to the register after the VM has started.
-EINVAL Invalid bitmap written to the register.
======= =============================================================
.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf
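
To make the GET/SET_ONE_REG flow above concrete, here is a hedged, arm64-only
userspace sketch that reads KVM_REG_ARM_PSCI_VERSION from an already-created
vCPU file descriptor; the helper name is illustrative::

   #include <stdint.h>
   #include <sys/ioctl.h>
   #include <linux/kvm.h>

   static int read_psci_version(int vcpu_fd, uint64_t *version)
   {
       struct kvm_one_reg reg = {
           .id   = KVM_REG_ARM_PSCI_VERSION,
           .addr = (uintptr_t)version,
       };

       /* Requires the vCPU to have the KVM_ARM_VCPU_PSCI_0_2 feature set. */
       return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
   }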


@@ -1,138 +1,118 @@
.. SPDX-License-Identifier: GPL-2.0
=======================
ARM Hypercall Interface
=======================
===============================================
KVM/arm64-specific hypercalls exposed to guests
===============================================
KVM handles the hypercall services as requested by the guests. New hypercall
services are regularly made available by the ARM specification or by KVM (as
vendor services) if they make sense from a virtualization point of view.
This file documents the KVM/arm64-specific hypercalls which may be
exposed by KVM/arm64 to guest operating systems. These hypercalls are
issued using the HVC instruction according to version 1.1 of the Arm SMC
Calling Convention (DEN0028/C):
This means that a guest booted on two different versions of KVM can observe
two different "firmware" revisions. This could cause issues if a given guest
is tied to a particular version of a hypercall service, or if a migration
causes a different version to be exposed out of the blue to an unsuspecting
guest.
https://developer.arm.com/docs/den0028/c
In order to remedy this situation, KVM exposes a set of "firmware
pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
interface. These registers can be saved/restored by userspace, and set
to a convenient value as required.
All KVM/arm64-specific hypercalls are allocated within the "Vendor
Specific Hypervisor Service Call" range with a UID of
``28b46fb6-2ec5-11e9-a9ca-4b564d003a74``. This UID should be queried by the
guest using the standard "Call UID" function for the service range in
order to determine that the KVM/arm64-specific hypercalls are available.
The following registers are defined:
``ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID``
---------------------------------------------
* KVM_REG_ARM_PSCI_VERSION:
Provides a discovery mechanism for other KVM/arm64 hypercalls.
KVM implements the PSCI (Power State Coordination Interface)
specification in order to provide services such as CPU on/off, reset
and power-off to the guest.
+---------------------+-------------------------------------------------------------+
| Presence: | Mandatory for the KVM/arm64 UID |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC32 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0x86000000 |
+---------------------+----------+--------------------------------------------------+
| Arguments: | None |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (uint32) | R0 | Bitmap of available function numbers 0-31 |
| +----------+----+---------------------------------------------+
| | (uint32) | R1 | Bitmap of available function numbers 32-63 |
| +----------+----+---------------------------------------------+
| | (uint32) | R2 | Bitmap of available function numbers 64-95 |
| +----------+----+---------------------------------------------+
| | (uint32) | R3 | Bitmap of available function numbers 96-127 |
+---------------------+----------+----+---------------------------------------------+
- Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
(and thus has already been initialized)
- Returns the current PSCI version on GET_ONE_REG (defaulting to the
highest PSCI version implemented by KVM and compatible with v0.2)
- Allows any PSCI version implemented by KVM and compatible with
v0.2 to be set with SET_ONE_REG
- Affects the whole VM (even if the register view is per-vcpu)
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:

Holds the state of the firmware support to mitigate CVE-2017-5715, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_1 in [1]_.

Accepted values are:

KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
KVM does not offer firmware support for the workaround. The mitigation
status for the guest is unknown.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
The workaround HVC call is available to the guest and required for the
mitigation.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
The workaround HVC call is available to the guest, but it is not needed
on this VCPU.

* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:

Holds the state of the firmware support to mitigate CVE-2018-3639, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_2 in [1]_.

Accepted values are:

KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
A workaround is not available. KVM does not offer firmware support for
the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
The workaround state is unknown. KVM does not offer firmware support
for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
The workaround is available, and can be disabled by a vCPU. If
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
this vCPU.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
The workaround is always active on this vCPU or it is not needed.

Bitmap Feature Firmware Registers
---------------------------------

Contrary to the above registers, the following registers expose the
hypercall services in the form of a feature bitmap to userspace. This
bitmap is translated to the services that are available to the guest.
There is one register defined per service call owner, accessible via the
GET/SET_ONE_REG interface.

By default, these registers are set with the upper limit of the features
that are supported. This way userspace can discover all the usable
hypercall services via GET_ONE_REG and write back the desired bitmap via
SET_ONE_REG. The features for registers that are left untouched, probably
because userspace isn't aware of them, will be exposed as-is to the guest.

Note that KVM does not allow userspace to configure these registers once
any of the vCPUs has run at least once; such a write returns -EBUSY.

The pseudo-firmware bitmap registers are as follows:

* KVM_REG_ARM_STD_BMAP:
Controls the bitmap of the ARM Standard Secure Service Calls.

The following bits are accepted:

Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0:
The bit represents the services offered under v1.0 of the ARM True Random
Number Generator (TRNG) specification, ARM DEN0098.

* KVM_REG_ARM_STD_HYP_BMAP:
Controls the bitmap of the ARM Standard Hypervisor Service Calls.

The following bits are accepted:

Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME:
The bit represents the Paravirtualized Time service as represented by
ARM DEN0057A.

* KVM_REG_ARM_VENDOR_HYP_BMAP:
Controls the bitmap of the Vendor specific Hypervisor Service Calls.

The following bits are accepted:

Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT:
The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID
and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids.
Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP:
The bit represents the Precision Time Protocol KVM service.

Errors:

======= =============================================================
-ENOENT Unknown register accessed.
-EBUSY Attempt a 'write' to the register after the VM has started.
-EINVAL Invalid bitmap written to the register.
======= =============================================================

.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf

``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID``
----------------------------------------

See ptp_kvm.rst.

``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``
----------------------------------

Query the memory protection parameters for a protected virtual machine.

+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000002 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else |
| | | | memory protection granule in bytes |
+---------------------+----------+----+---------------------------------------------+

``ARM_SMCCC_KVM_FUNC_MEM_SHARE``
--------------------------------

Share a region of memory with the KVM host, granting it read, write and execute
permissions. The size of the region is equal to the memory protection granule
advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``.

+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000003 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Base IPA of memory region to share |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``SUCCESS (0)`` |
| | | +---------------------------------------------+
| | | | ``INVALID_PARAMETER (-3)`` |
+---------------------+----------+----+---------------------------------------------+

``ARM_SMCCC_KVM_FUNC_MEM_UNSHARE``
----------------------------------

Revoke access permission from the KVM host to a memory region previously shared
with ``ARM_SMCCC_KVM_FUNC_MEM_SHARE``. The size of the region is equal to the
memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``.

+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000004 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Base IPA of memory region to unshare |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``SUCCESS (0)`` |
| | | +---------------------------------------------+
| | | | ``INVALID_PARAMETER (-3)`` |
+---------------------+----------+----+---------------------------------------------+
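
As a rough guest-side illustration of the three calls above, the sketch below
queries the protection granule, shares one granule-sized buffer with the host
and then takes it back. This is a minimal sketch, not the in-kernel
implementation: the ``KVM_FUNC_*`` constants are defined locally from the
function IDs in the tables, buffer handling is simplified, and only
``arm_smccc_1_1_invoke()`` from ``<linux/arm-smccc.h>`` is assumed to be
available::

    /* Hypothetical sketch: share one protection granule with the host. */
    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/io.h>
    #include <linux/mm.h>

    #define KVM_FUNC_HYP_MEMINFO   0xC6000002   /* IDs from the tables above */
    #define KVM_FUNC_MEM_SHARE     0xC6000003
    #define KVM_FUNC_MEM_UNSHARE   0xC6000004

    static int share_one_granule_with_host(void)
    {
            struct arm_smccc_res res;
            unsigned long granule;
            phys_addr_t ipa;
            void *buf;
            int ret = 0;

            /* Ask the hypervisor for the memory protection granule size. */
            arm_smccc_1_1_invoke(KVM_FUNC_HYP_MEMINFO, 0, 0, 0, &res);
            if ((long)res.a0 < 0)
                    return -ENXIO;
            granule = res.a0;

            /* Allocate a granule-sized, naturally aligned buffer. */
            buf = (void *)__get_free_pages(GFP_KERNEL, get_order(granule));
            if (!buf)
                    return -ENOMEM;
            ipa = virt_to_phys(buf);        /* guest PA == IPA */

            /* Grant the host R/W/X access to the granule starting at 'ipa'. */
            arm_smccc_1_1_invoke(KVM_FUNC_MEM_SHARE, ipa, 0, 0, &res);
            if ((long)res.a0 < 0) {
                    ret = -EINVAL;
                    goto out_free;
            }

            /* ... exchange data with the host through 'buf' ... */

            /* Revoke the host's access before reusing the memory privately. */
            arm_smccc_1_1_invoke(KVM_FUNC_MEM_UNSHARE, ipa, 0, 0, &res);
            if ((long)res.a0 < 0)
                    ret = -EINVAL;

    out_free:
            free_pages((unsigned long)buf, get_order(granule));
            return ret;
    }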

View File

@ -7,7 +7,10 @@ ARM
.. toctree::
:maxdepth: 2
fw-pseudo-registers
hyp-abi
hypercalls
pkvm
pvtime
ptp_kvm
mmio-guard

View File

@ -0,0 +1,74 @@
.. SPDX-License-Identifier: GPL-2.0
==============
KVM MMIO guard
==============
KVM implements device emulation by handling translation faults to any
IPA range that is not contained in a memory slot. Such a translation
fault is in most cases passed on to userspace (or in rare cases to the
host kernel) with the address, size and possibly data of the access
for emulation.
Should the guest exit with an address that is not one that corresponds
to an emulatable device, userspace may take measures that are not the
most graceful as far as the guest is concerned (such as terminating it
or delivering a fatal exception).
There is also an element of trust: by forwarding the request to
userspace, the kernel assumes that the guest trusts userspace to do
the right thing.
The KVM MMIO guard offers a way to mitigate this last point: a guest
can request that only certain regions of the IPA space are valid as
MMIO. Only accesses to these regions will be handled as MMIO; any other
access will result in an exception being delivered to the guest.
This relies on a set of hypercalls defined in the KVM-specific range,
using the HVC64 calling convention.
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO
============== ======== ================================
Function ID: (uint32) 0xC6000002
Arguments: none
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
(uint64) Protection Granule (PG) size in
bytes (r0)
============== ======== ================================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL
============== ======== ==============================
Function ID: (uint32) 0xC6000003
Arguments: none
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ==============================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP
============== ======== ====================================
Function ID: (uint32) 0xC6000004
Arguments: (uint64) The base of the PG-sized IPA range
that is allowed to be accessed as
MMIO. Must be aligned to the PG size
(r1)
(uint64) Index in the MAIR_EL1 register
providing the memory attribute that
is used by the guest (r2)
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ====================================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP
============== ======== ======================================
Function ID: (uint32) 0xC6000005
Arguments: (uint64) Base of a PG-sized IPA range that has
previously been mapped with
MMIO_GUARD_MAP. Must be aligned to
the PG size (r1)
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ======================================
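
For illustration only, a guest that wants to use the MMIO guard could enroll
and then whitelist each device page it intends to access, roughly as sketched
below. This is a sketch under a few assumptions: the ``MMIO_GUARD_*`` constants
are defined locally from the function IDs listed above, ``arm_smccc_1_1_invoke()``
is used for the HVCs, and ``MT_DEVICE_nGnRE`` is taken as the MAIR_EL1 index
the guest maps the device with (the UNMAP call reverses a MAP in the same way)::

    /* Hypothetical sketch: opt in to the MMIO guard and whitelist one page. */
    #include <linux/align.h>
    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <asm/memory.h>

    #define MMIO_GUARD_INFO    0xC6000002   /* function IDs from the list above */
    #define MMIO_GUARD_ENROLL  0xC6000003
    #define MMIO_GUARD_MAP     0xC6000004

    static int mmio_guard_allow_page(phys_addr_t mmio_ipa)
    {
            struct arm_smccc_res res;
            u64 granule;

            /* Discover the protection granule the hypervisor operates on. */
            arm_smccc_1_1_invoke(MMIO_GUARD_INFO, &res);
            if ((long)res.a0 < 0)
                    return -ENXIO;
            granule = res.a0;

            /* Enroll this VM: from now on, only mapped ranges count as MMIO. */
            arm_smccc_1_1_invoke(MMIO_GUARD_ENROLL, &res);
            if ((long)res.a0 < 0)
                    return -ENXIO;

            /* Allow MMIO emulation for the granule covering 'mmio_ipa'. */
            arm_smccc_1_1_invoke(MMIO_GUARD_MAP,
                                 ALIGN_DOWN(mmio_ipa, granule),
                                 MT_DEVICE_nGnRE, /* MAIR_EL1 index, an assumption */
                                 &res);
            return (long)res.a0 < 0 ? -EINVAL : 0;
    }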

View File

@ -0,0 +1,96 @@
.. SPDX-License-Identifier: GPL-2.0
Protected virtual machines (pKVM)
=================================
Introduction
------------
Protected KVM (pKVM) is a KVM/arm64 extension which uses the two-stage
translation capability of the Armv8 MMU to isolate guest memory from the host
system. This allows for the creation of a confidential computing environment
without relying on whizz-bang features in hardware, but still allowing room for
complementary technologies such as memory encryption and hardware-backed
attestation.
The major implementation change brought about by pKVM is that the hypervisor
code running at EL2 is now largely independent of (and isolated from) the rest
of the host kernel running at EL1 and therefore additional hypercalls are
introduced to manage manipulation of guest stage-2 page tables, creation of VM
data structures and reclamation of memory on teardown. An immediate consequence
of this change is that the host itself runs with an identity mapping enabled
at stage-2, providing the hypervisor code with a mechanism to restrict host
access to an arbitrary physical page.
Enabling pKVM
-------------
The pKVM hypervisor is enabled by booting the host kernel at EL2 with
"``kvm-arm.mode=protected``" on the command-line. Once enabled, VMs can be spawned
in either protected or non-protected state, although the hypervisor is still
responsible for managing most of the VM metadata in either case.
Limitations
-----------
Enabling pKVM places some significant limitations on KVM guests, regardless of
whether they are spawned in protected state. It is therefore recommended only
to enable pKVM if protected VMs are required, with non-protected state acting
primarily as a debug and development aid.
If you're still keen, then here is an incomplete list of caveats that apply
to all VMs running under pKVM:
- Guest memory cannot be file-backed (with the exception of shmem/memfd) and is
pinned as it is mapped into the guest. This prevents the host from
swapping-out, migrating, merging or generally doing anything useful with the
guest pages. It also requires that the VMM has either ``CAP_IPC_LOCK`` or
sufficient ``RLIMIT_MEMLOCK`` to account for this pinned memory.
- GICv2 is not supported and therefore GICv3 hardware is required in order
to expose a virtual GICv3 to the guest.
- Read-only memslots are unsupported and therefore dirty logging cannot be
enabled.
- Memslot configuration is fixed once a VM has started running, with subsequent
move or deletion requests being rejected with ``-EPERM``.
- There are probably many others.
Since the host is unable to tear down the hypervisor when pKVM is enabled,
hibernation (``CONFIG_HIBERNATION``) and kexec (``CONFIG_KEXEC``) will fail
with ``-EBUSY``.
If you are not happy with these limitations, then please don't enable pKVM :)
VM creation
-----------
When pKVM is enabled, protected VMs can be created by specifying the
``KVM_VM_TYPE_ARM_PROTECTED`` flag in the machine type identifier parameter
passed to ``KVM_CREATE_VM``.
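
A minimal VMM-side sketch follows, assuming the ``KVM_VM_TYPE_ARM_PROTECTED``
definition exported by the UAPI headers described here; error handling and the
usual vCPU/memslot setup are elided::

    /* Hypothetical VMM sketch: create a pKVM protected VM. */
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int create_protected_vm(void)
    {
            int kvm_fd, vm_fd;

            kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
            if (kvm_fd < 0)
                    return -1;

            /*
             * KVM_VM_TYPE_ARM_PROTECTED is OR-ed into the machine type
             * identifier; an explicit IPA size could additionally be
             * requested with KVM_VM_TYPE_ARM_IPA_SIZE().
             */
            vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_PROTECTED);

            /* vCPUs, memslots, etc. are then set up as for a normal VM. */
            return vm_fd;
    }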
Protected VMs are instantiated according to a fixed vCPU configuration
described by the ID register definitions in
``arch/arm64/include/asm/kvm_pkvm.h``. Only a subset of the architectural
features that may be available to the host are exposed to the guest and the
capabilities advertised by ``KVM_CHECK_EXTENSION`` are limited accordingly,
with the vCPU registers being initialised to their architecturally-defined
values.
Where not defined by the architecture, the registers of a protected vCPU
are reset to zero with the exception of the PC and X0 which can be set
either by the ``KVM_SET_ONE_REG`` interface or by a call to PSCI ``CPU_ON``.
VM runtime
----------
By default, memory pages mapped into a protected guest are inaccessible to the
host and any attempt by the host to access such a page will result in the
injection of an abort at EL1 by the hypervisor. For accesses originating from
EL0, the host will then terminate the current task with a ``SIGSEGV``.
pKVM exposes additional hypercalls to protected guests, primarily for the
purpose of establishing shared-memory regions with the host for communication
and I/O. These hypercalls are documented in hypercalls.rst.

View File

@ -7,19 +7,29 @@ PTP_KVM is used for high precision time sync between host and guests.
It relies on transferring the wall clock and counter value from the
host to the guest using a KVM-specific hypercall.
* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001

This hypercall uses the SMC32/HVC32 calling convention:

ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
============== ======== =====================================
Function ID: (uint32) 0x86000001
Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0)
KVM_PTP_PHYS_COUNTER(1)
Return Values: (int32) NOT_SUPPORTED(-1) on error, or
(uint32) Upper 32 bits of wall clock time (r0)
(uint32) Lower 32 bits of wall clock time (r1)
(uint32) Upper 32 bits of counter (r2)
(uint32) Lower 32 bits of counter (r3)
Endianness: No Restrictions.
============== ======== =====================================

``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID``
----------------------------------------

Retrieve current time information for the specific counter. There are no
endianness restrictions.

+---------------------+-------------------------------------------------------+
| Presence: | Optional |
+---------------------+-------------------------------------------------------+
| Calling convention: | HVC32 |
+---------------------+----------+--------------------------------------------+
| Function ID: | (uint32) | 0x86000001 |
+---------------------+----------+----+---------------------------------------+
| Arguments: | (uint32) | R1 | ``KVM_PTP_VIRT_COUNTER (0)`` |
| | | +---------------------------------------+
| | | | ``KVM_PTP_PHYS_COUNTER (1)`` |
+---------------------+----------+----+---------------------------------------+
| Return Values: | (int32) | R0 | ``NOT_SUPPORTED (-1)`` on error, else |
| | | | upper 32 bits of wall clock time |
| +----------+----+---------------------------------------+
| | (uint32) | R1 | Lower 32 bits of wall clock time |
| +----------+----+---------------------------------------+
| | (uint32) | R2 | Upper 32 bits of counter |
| +----------+----+---------------------------------------+
| | (uint32) | R3 | Lower 32 bits of counter |
+---------------------+----------+----+---------------------------------------+
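
For reference, a guest can recombine the four 32-bit halves roughly the way
the in-kernel ptp_kvm driver does. The sketch below is a simplified
illustration, with the function ID and argument value taken from the table
above and the constants defined locally for clarity::

    /* Hypothetical sketch: one PTP hypercall returning wall clock + counter. */
    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    #define PTP_FUNC_ID       0x86000001   /* from the table above */
    #define PTP_VIRT_COUNTER  0            /* KVM_PTP_VIRT_COUNTER (0) */

    static int ptp_kvm_read_clocks(u64 *wall_ns, u64 *cycles)
    {
            struct arm_smccc_res res;

            arm_smccc_1_1_invoke(PTP_FUNC_ID, PTP_VIRT_COUNTER, &res);
            if ((int)res.a0 < 0)    /* NOT_SUPPORTED(-1) is a 32-bit value */
                    return -EOPNOTSUPP;

            /* R0/R1 carry the wall clock, R2/R3 the counter, as 32-bit halves. */
            *wall_ns = ((u64)(u32)res.a0 << 32) | (u32)res.a1;
            *cycles  = ((u64)(u32)res.a2 << 32) | (u32)res.a3;
            return 0;
    }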

View File

@ -7831,6 +7831,7 @@ M: Chao Yu <chao@kernel.org>
L: linux-f2fs-devel@lists.sourceforge.net
S: Maintained
W: https://f2fs.wiki.kernel.org/
B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git
F: Documentation/ABI/testing/sysfs-fs-f2fs
F: Documentation/filesystems/f2fs.rst

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 6
PATCHLEVEL = 1
SUBLEVEL = 0
SUBLEVEL = 1
EXTRAVERSION =
NAME = Hurr durr I'ma ninja sloth
@ -2094,7 +2094,9 @@ $(clean-dirs):
clean: $(clean-dirs)
$(call cmd,rmfiles)
@find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
@find $(or $(KBUILD_EXTMOD), .) \
$(if $(filter-out arch/$(SRCARCH)/boot/dts, $(dtstree)), $(dtstree)) \
$(RCS_FIND_IGNORE) \
\( -name '*.[aios]' -o -name '*.rsi' -o -name '*.ko' -o -name '.*.cmd' \
-o -name '*.ko.*' \
-o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \

View File

@ -1 +1 @@
5a26ea7c4a053d84e22ddc713c092565ccf2a173
09ad10d4ee63f8983acad5515463dd202cc83054

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,538 @@
__cfg80211_alloc_event_skb
__cfg80211_alloc_reply_skb
__cfg80211_radar_event
__cfg80211_send_event_skb
__hci_cmd_send
__hci_cmd_sync
__hci_cmd_sync_ev
__hci_cmd_sync_sk
__hci_cmd_sync_status
__hci_cmd_sync_status_sk
__ieee80211_schedule_txq
__nfc_alloc_vendor_cmd_reply_skb
alloc_can_err_skb
alloc_can_skb
alloc_candev_mqs
alloc_canfd_skb
alloc_canxl_skb
arc4_crypt
arc4_setkey
baswap
bridge_tunnel_header
bt_accept_dequeue
bt_accept_enqueue
bt_accept_unlink
bt_debugfs
bt_err
bt_err_ratelimited
bt_info
bt_procfs_cleanup
bt_procfs_init
bt_sock_ioctl
bt_sock_link
bt_sock_poll
bt_sock_reclassify_lock
bt_sock_recvmsg
bt_sock_register
bt_sock_stream_recvmsg
bt_sock_unlink
bt_sock_unregister
bt_sock_wait_ready
bt_sock_wait_state
bt_status
bt_to_errno
bt_warn
bt_warn_ratelimited
btbcm_check_bdaddr
btbcm_finalize
btbcm_initialize
btbcm_patchram
btbcm_read_pcm_int_params
btbcm_set_bdaddr
btbcm_setup_apple
btbcm_setup_patchram
btbcm_write_pcm_int_params
can_bus_off
can_change_mtu
can_change_state
can_dropped_invalid_skb
can_eth_ioctl_hwts
can_ethtool_op_get_ts_info_hwts
can_fd_dlc2len
can_fd_len2dlc
can_free_echo_skb
can_get_echo_skb
can_get_state_str
can_proto_register
can_proto_unregister
can_put_echo_skb
can_rx_offload_add_fifo
can_rx_offload_add_manual
can_rx_offload_add_timestamp
can_rx_offload_del
can_rx_offload_enable
can_rx_offload_get_echo_skb
can_rx_offload_irq_finish
can_rx_offload_irq_offload_fifo
can_rx_offload_irq_offload_timestamp
can_rx_offload_queue_tail
can_rx_offload_queue_timestamp
can_rx_offload_threaded_irq_finish
can_rx_register
can_rx_unregister
can_send
can_skb_get_frame_len
can_sock_destruct
cfg80211_any_usable_channels
cfg80211_assoc_comeback
cfg80211_assoc_failure
cfg80211_auth_timeout
cfg80211_background_cac_abort
cfg80211_bss_color_notify
cfg80211_bss_flush
cfg80211_bss_iter
cfg80211_cac_event
cfg80211_calculate_bitrate
cfg80211_ch_switch_notify
cfg80211_ch_switch_started_notify
cfg80211_chandef_compatible
cfg80211_chandef_create
cfg80211_chandef_dfs_required
cfg80211_chandef_usable
cfg80211_chandef_valid
cfg80211_check_combinations
cfg80211_check_station_change
cfg80211_classify8021d
cfg80211_conn_failed
cfg80211_connect_done
cfg80211_control_port_tx_status
cfg80211_cqm_beacon_loss_notify
cfg80211_cqm_pktloss_notify
cfg80211_cqm_rssi_notify
cfg80211_cqm_txe_notify
cfg80211_crit_proto_stopped
cfg80211_del_sta_sinfo
cfg80211_disconnected
cfg80211_external_auth_request
cfg80211_find_elem_match
cfg80211_find_vendor_elem
cfg80211_free_nan_func
cfg80211_ft_event
cfg80211_get_bss
cfg80211_get_drvinfo
cfg80211_get_ies_channel_number
cfg80211_get_iftype_ext_capa
cfg80211_get_p2p_attr
cfg80211_get_station
cfg80211_gtk_rekey_notify
cfg80211_ibss_joined
cfg80211_iftype_allowed
cfg80211_inform_bss_data
cfg80211_inform_bss_frame_data
cfg80211_is_element_inherited
cfg80211_iter_combinations
cfg80211_merge_profile
cfg80211_mgmt_tx_status_ext
cfg80211_michael_mic_failure
cfg80211_nan_func_terminated
cfg80211_nan_match
cfg80211_new_sta
cfg80211_notify_new_peer_candidate
cfg80211_pmksa_candidate_notify
cfg80211_pmsr_complete
cfg80211_pmsr_report
cfg80211_port_authorized
cfg80211_probe_status
cfg80211_put_bss
cfg80211_ready_on_channel
cfg80211_ref_bss
cfg80211_reg_can_beacon
cfg80211_reg_can_beacon_relax
cfg80211_register_netdevice
cfg80211_remain_on_channel_expired
cfg80211_report_obss_beacon_khz
cfg80211_report_wowlan_wakeup
cfg80211_roamed
cfg80211_rx_assoc_resp
cfg80211_rx_control_port
cfg80211_rx_mgmt_ext
cfg80211_rx_mlme_mgmt
cfg80211_rx_spurious_frame
cfg80211_rx_unexpected_4addr_frame
cfg80211_rx_unprot_mlme_mgmt
cfg80211_scan_done
cfg80211_sched_scan_results
cfg80211_sched_scan_stopped
cfg80211_sched_scan_stopped_locked
cfg80211_send_layer2_update
cfg80211_shutdown_all_interfaces
cfg80211_sinfo_alloc_tid_stats
cfg80211_sta_opmode_change_notify
cfg80211_stop_iface
cfg80211_tdls_oper_request
cfg80211_tx_mgmt_expired
cfg80211_tx_mlme_mgmt
cfg80211_unlink_bss
cfg80211_unregister_wdev
cfg80211_update_owe_info_event
cfg80211_vendor_cmd_get_sender
cfg80211_vendor_cmd_reply
close_candev
free_candev
freq_reg_info
get_wiphy_regdom
h4_recv_buf
hci_alloc_dev_priv
hci_cmd_sync
hci_cmd_sync_cancel
hci_cmd_sync_queue
hci_conn_check_secure
hci_conn_security
hci_conn_switch_role
hci_free_dev
hci_get_route
hci_mgmt_chan_register
hci_mgmt_chan_unregister
hci_recv_diag
hci_recv_frame
hci_register_cb
hci_register_dev
hci_release_dev
hci_reset_dev
hci_resume_dev
hci_set_fw_info
hci_set_hw_info
hci_suspend_dev
hci_uart_register_device
hci_uart_tx_wakeup
hci_uart_unregister_device
hci_unregister_cb
hci_unregister_dev
hidp_hid_driver
ieee80211_alloc_hw_nm
ieee80211_amsdu_to_8023s
ieee80211_ap_probereq_get
ieee80211_ave_rssi
ieee80211_beacon_cntdwn_is_complete
ieee80211_beacon_get_template
ieee80211_beacon_get_tim
ieee80211_beacon_loss
ieee80211_beacon_set_cntdwn
ieee80211_beacon_update_cntdwn
ieee80211_bss_get_elem
ieee80211_calc_rx_airtime
ieee80211_calc_tx_airtime
ieee80211_chandef_to_operating_class
ieee80211_channel_switch_disconnect
ieee80211_channel_to_freq_khz
ieee80211_chswitch_done
ieee80211_color_change_finish
ieee80211_connection_loss
ieee80211_cqm_beacon_loss_notify
ieee80211_cqm_rssi_notify
ieee80211_csa_finish
ieee80211_ctstoself_duration
ieee80211_ctstoself_get
ieee80211_data_to_8023_exthdr
ieee80211_disable_rssi_reports
ieee80211_disconnect
ieee80211_enable_rssi_reports
ieee80211_find_sta
ieee80211_find_sta_by_ifaddr
ieee80211_find_sta_by_link_addrs
ieee80211_free_hw
ieee80211_free_txskb
ieee80211_freq_khz_to_channel
ieee80211_generic_frame_duration
ieee80211_get_bssid
ieee80211_get_buffered_bc
ieee80211_get_channel_khz
ieee80211_get_fils_discovery_tmpl
ieee80211_get_hdrlen_from_skb
ieee80211_get_key_rx_seq
ieee80211_get_mesh_hdrlen
ieee80211_get_num_supported_channels
ieee80211_get_response_rate
ieee80211_get_tkip_p1k_iv
ieee80211_get_tkip_p2k
ieee80211_get_tkip_rx_p1k
ieee80211_get_tx_rates
ieee80211_get_unsol_bcast_probe_resp_tmpl
ieee80211_get_vht_max_nss
ieee80211_gtk_rekey_add
ieee80211_gtk_rekey_notify
ieee80211_hdrlen
ieee80211_hw_restart_disconnect
ieee80211_ie_split_ric
ieee80211_iter_chan_contexts_atomic
ieee80211_iter_keys
ieee80211_iter_keys_rcu
ieee80211_iterate_active_interfaces_atomic
ieee80211_iterate_active_interfaces_mtx
ieee80211_iterate_interfaces
ieee80211_iterate_stations
ieee80211_iterate_stations_atomic
ieee80211_key_mic_failure
ieee80211_key_replay
ieee80211_manage_rx_ba_offl
ieee80211_mandatory_rates
ieee80211_mark_rx_ba_filtered_frames
ieee80211_nan_func_match
ieee80211_nan_func_terminated
ieee80211_next_txq
ieee80211_nullfunc_get
ieee80211_operating_class_to_band
ieee80211_parse_p2p_noa
ieee80211_probereq_get
ieee80211_proberesp_get
ieee80211_pspoll_get
ieee80211_queue_delayed_work
ieee80211_queue_stopped
ieee80211_queue_work
ieee80211_radar_detected
ieee80211_radiotap_iterator_init
ieee80211_radiotap_iterator_next
ieee80211_rate_control_register
ieee80211_rate_control_unregister
ieee80211_ready_on_channel
ieee80211_register_hw
ieee80211_remain_on_channel_expired
ieee80211_remove_key
ieee80211_report_low_ack
ieee80211_report_wowlan_wakeup
ieee80211_request_smps
ieee80211_reserve_tid
ieee80211_restart_hw
ieee80211_resume_disconnect
ieee80211_rts_duration
ieee80211_rts_get
ieee80211_rx_ba_timer_expired
ieee80211_rx_irqsafe
ieee80211_rx_list
ieee80211_rx_napi
ieee80211_s1g_channel_width
ieee80211_scan_completed
ieee80211_sched_scan_results
ieee80211_sched_scan_stopped
ieee80211_send_bar
ieee80211_send_eosp_nullfunc
ieee80211_set_active_links
ieee80211_set_active_links_async
ieee80211_set_key_rx_seq
ieee80211_sta_block_awake
ieee80211_sta_eosp
ieee80211_sta_ps_transition
ieee80211_sta_pspoll
ieee80211_sta_recalc_aggregates
ieee80211_sta_register_airtime
ieee80211_sta_set_buffered
ieee80211_sta_uapsd_trigger
ieee80211_start_tx_ba_cb_irqsafe
ieee80211_start_tx_ba_session
ieee80211_stop_queue
ieee80211_stop_queues
ieee80211_stop_rx_ba_session
ieee80211_stop_tx_ba_cb_irqsafe
ieee80211_stop_tx_ba_session
ieee80211_tdls_oper_request
ieee80211_tkip_add_iv
ieee80211_tx_dequeue
ieee80211_tx_prepare_skb
ieee80211_tx_rate_update
ieee80211_tx_status
ieee80211_tx_status_8023
ieee80211_tx_status_ext
ieee80211_tx_status_irqsafe
ieee80211_txq_airtime_check
ieee80211_txq_get_depth
ieee80211_txq_may_transmit
ieee80211_txq_schedule_start
ieee80211_unregister_hw
ieee80211_unreserve_tid
ieee80211_update_mu_groups
ieee80211_update_p2p_noa
ieee80211_vif_to_wdev
ieee80211_wake_queue
ieee80211_wake_queues
ieee802154_alloc_hw
ieee802154_configure_durations
ieee802154_free_hw
ieee802154_hdr_peek
ieee802154_hdr_peek_addrs
ieee802154_hdr_pull
ieee802154_hdr_push
ieee802154_max_payload
ieee802154_register_hw
ieee802154_rx_irqsafe
ieee802154_stop_queue
ieee802154_unregister_hw
ieee802154_wake_queue
ieee802154_xmit_complete
ieee802154_xmit_error
ieee802154_xmit_hw_error
ieeee80211_obss_color_collision_notify
l2cap_add_psm
l2cap_chan_close
l2cap_chan_connect
l2cap_chan_create
l2cap_chan_del
l2cap_chan_list
l2cap_chan_put
l2cap_chan_send
l2cap_chan_set_defaults
l2cap_conn_get
l2cap_conn_put
l2cap_is_socket
l2cap_register_user
l2cap_unregister_user
l2tp_recv_common
l2tp_session_create
l2tp_session_dec_refcount
l2tp_session_delete
l2tp_session_get
l2tp_session_get_by_ifname
l2tp_session_get_nth
l2tp_session_inc_refcount
l2tp_session_register
l2tp_session_set_header_len
l2tp_sk_to_tunnel
l2tp_tunnel_create
l2tp_tunnel_dec_refcount
l2tp_tunnel_delete
l2tp_tunnel_get
l2tp_tunnel_get_nth
l2tp_tunnel_get_session
l2tp_tunnel_inc_refcount
l2tp_tunnel_register
l2tp_udp_encap_recv
l2tp_xmit_skb
lowpan_header_compress
lowpan_header_decompress
lowpan_nhc_add
lowpan_nhc_del
lowpan_register_netdev
lowpan_register_netdevice
lowpan_unregister_netdev
lowpan_unregister_netdevice
nfc_add_se
nfc_alloc_recv_skb
nfc_allocate_device
nfc_class
nfc_dep_link_is_up
nfc_driver_failure
nfc_find_se
nfc_fw_download_done
nfc_get_local_general_bytes
nfc_proto_register
nfc_proto_unregister
nfc_register_device
nfc_remove_se
nfc_se_connectivity
nfc_se_transaction
nfc_send_to_raw_sock
nfc_set_remote_general_bytes
nfc_target_lost
nfc_targets_found
nfc_tm_activated
nfc_tm_data_received
nfc_tm_deactivated
nfc_unregister_device
nfc_vendor_cmd_reply
of_can_transceiver
open_candev
ppp_channel_index
ppp_dev_name
ppp_input
ppp_input_error
ppp_output_wakeup
ppp_register_channel
ppp_register_compressor
ppp_register_net_channel
ppp_unit_number
ppp_unregister_channel
ppp_unregister_compressor
pppox_compat_ioctl
pppox_ioctl
pppox_unbind_sock
qca_read_soc_version
qca_send_pre_shutdown_cmd
qca_set_bdaddr
qca_set_bdaddr_rome
qca_uart_setup
rate_control_set_rates
reg_initiator_name
reg_query_regdb_wmm
register_candev
register_pppox_proto
regulatory_hint
regulatory_pre_cac_allowed
regulatory_set_wiphy_regd
regulatory_set_wiphy_regd_sync
rfc1042_header
rfkill_alloc
rfkill_blocked
rfkill_destroy
rfkill_find_type
rfkill_get_led_trigger_name
rfkill_init_sw_state
rfkill_pause_polling
rfkill_register
rfkill_resume_polling
rfkill_set_hw_state_reason
rfkill_set_led_trigger_name
rfkill_set_states
rfkill_set_sw_state
rfkill_soft_blocked
rfkill_unregister
safe_candev_priv
slhc_compress
slhc_free
slhc_init
slhc_remember
slhc_toss
slhc_uncompress
tipc_dump_done
tipc_dump_start
tipc_nl_sk_walk
tipc_sk_fill_sock_diag
unregister_candev
unregister_pppox_proto
usb_serial_claim_interface
usb_serial_deregister_drivers
usb_serial_generic_chars_in_buffer
usb_serial_generic_close
usb_serial_generic_get_icount
usb_serial_generic_open
usb_serial_generic_process_read_urb
usb_serial_generic_read_bulk_callback
usb_serial_generic_resume
usb_serial_generic_submit_read_urbs
usb_serial_generic_throttle
usb_serial_generic_tiocmiwait
usb_serial_generic_unthrottle
usb_serial_generic_wait_until_sent
usb_serial_generic_write
usb_serial_generic_write_bulk_callback
usb_serial_generic_write_start
usb_serial_handle_dcd_change
usb_serial_port_softint
usb_serial_register_drivers
usb_serial_resume
usb_serial_suspend
wdev_chandef
wdev_to_ieee80211_vif
wiphy_apply_custom_regulatory
wiphy_free
wiphy_new_nm
wiphy_read_of_freq_limits
wiphy_register
wiphy_rfkill_set_hw_state_reason
wiphy_rfkill_start_polling
wiphy_to_ieee80211_hw
wiphy_unregister
wpan_phy_find
wpan_phy_for_each
wpan_phy_free
wpan_phy_new
wpan_phy_register
wpan_phy_unregister

View File

@ -0,0 +1,47 @@
drivers/bluetooth/btbcm.ko
drivers/bluetooth/btqca.ko
drivers/bluetooth/btsdio.ko
drivers/bluetooth/hci_uart.ko
drivers/net/can/dev/can-dev.ko
drivers/net/can/slcan/slcan.ko
drivers/net/can/vcan.ko
drivers/net/ppp/bsd_comp.ko
drivers/net/ppp/ppp_deflate.ko
drivers/net/ppp/ppp_generic.ko
drivers/net/ppp/ppp_mppe.ko
drivers/net/ppp/pppox.ko
drivers/net/ppp/pptp.ko
drivers/net/slip/slhc.ko
drivers/usb/class/cdc-acm.ko
drivers/usb/serial/ftdi_sio.ko
drivers/usb/serial/usbserial.ko
lib/crypto/libarc4.ko
net/6lowpan/6lowpan.ko
net/6lowpan/nhc_dest.ko
net/6lowpan/nhc_fragment.ko
net/6lowpan/nhc_hop.ko
net/6lowpan/nhc_ipv6.ko
net/6lowpan/nhc_mobility.ko
net/6lowpan/nhc_routing.ko
net/6lowpan/nhc_udp.ko
net/8021q/8021q.ko
net/bluetooth/bluetooth.ko
net/bluetooth/hidp/hidp.ko
net/bluetooth/rfcomm/rfcomm.ko
net/can/can.ko
net/can/can-bcm.ko
net/can/can-gw.ko
net/can/can-raw.ko
net/ieee802154/6lowpan/ieee802154_6lowpan.ko
net/ieee802154/ieee802154.ko
net/ieee802154/ieee802154_socket.ko
net/l2tp/l2tp_core.ko
net/l2tp/l2tp_ppp.ko
net/mac80211/mac80211.ko
net/mac802154/mac802154.ko
net/nfc/nfc.ko
net/rfkill/rfkill.ko
net/tipc/diag.ko
net/tipc/tipc.ko
net/wireless/cfg80211.ko

View File

@ -1316,6 +1316,9 @@ config RELR
config ARCH_HAS_MEM_ENCRYPT
bool
config ARCH_HAS_MEM_RELINQUISH
bool
config ARCH_HAS_CC_PLATFORM
bool

View File

@ -6,5 +6,6 @@
void kvm_init_hyp_services(void);
bool kvm_arm_hyp_service_available(u32 func_id);
void kvm_arm_init_hyp_services(void);
#endif

View File

@ -28,9 +28,12 @@ config ARM64
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_IOREMAP_PHYS_HOOKS
select ARCH_HAS_KCOV
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEM_RELINQUISH
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
@ -141,6 +144,7 @@ config ARM64
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
select HARDIRQS_SW_RESEND
select HAVE_MOD_ARCH_SPECIFIC if (ARM64_MODULE_PLTS || KVM)
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
select HAVE_PCI
@ -2058,7 +2062,6 @@ config ARM64_SME
config ARM64_MODULE_PLTS
bool "Use PLTs to allow module memory to spill over into vmalloc area"
depends on MODULES
select HAVE_MOD_ARCH_SPECIFIC
help
Allocate PLTs when loading modules so that jumps and calls whose
targets are too far away for their relative offsets to be encoded
@ -2204,6 +2207,12 @@ config CMDLINE_FROM_BOOTLOADER
the boot loader doesn't provide any, the default kernel command
string provided in CMDLINE will be used.
config CMDLINE_EXTEND
bool "Extend bootloader kernel arguments"
help
The command-line arguments provided by the boot loader will be
appended to the default kernel command string.
config CMDLINE_FORCE
bool "Always use the default kernel command string"
help

View File

@ -190,6 +190,11 @@ ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS),y)
endif
endif
ifeq ($(CONFIG_KVM),y)
archscripts:
$(Q)$(MAKE) $(build)=arch/arm64/tools gen-hyprel
endif
ifeq ($(KBUILD_EXTMOD),)
# We need to generate vdso-offsets.h before compiling certain files in kernel/.
# In order to do that, we should use the archprepare target, but we can't since

View File

@ -0,0 +1,49 @@
# SPDX-License-Identifier: GPL-2.0
#
# This file is included by the generic Kbuild makefile to permit the
# architecture to perform postlink actions on vmlinux and any .ko module file.
# In this case, we only need it for fips140.ko, which needs some postprocessing
# for the integrity check mandated by FIPS. This involves making copies of the
# relocation sections so that the module will have access to them at
# initialization time, and calculating and injecting a HMAC digest into the
# module. All other targets are NOPs.
#
PHONY := __archpost
__archpost:
-include include/config/auto.conf
include scripts/Kbuild.include
CMD_FIPS140_GEN_HMAC = crypto/fips140_gen_hmac
quiet_cmd_gen_hmac = HMAC $@
cmd_gen_hmac = $(OBJCOPY) $@ \
--dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.text\S*')=$@.rela.text \
--dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.rodata\S*')=$@.rela.rodata && \
$(OBJCOPY) $@ \
--add-section=.init.rela.text=$@.rela.text \
--add-section=.init.rela.rodata=$@.rela.rodata \
--set-section-flags=.init.rela.text=alloc,readonly \
--set-section-flags=.init.rela.rodata=alloc,readonly && \
$(CMD_FIPS140_GEN_HMAC) $@
# `@true` prevents complaints when there is nothing to be done
vmlinux: FORCE
@true
$(objtree)/crypto/fips140.ko: FORCE
$(call cmd,gen_hmac)
%.ko: FORCE
@true
clean:
rm -f $(objtree)/crypto/fips140.ko.rela.*
PHONY += FORCE clean
FORCE:
.PHONY: $(PHONY)

View File

@ -0,0 +1,2 @@
CONFIG_CRYPTO_FIPS140_MOD=m
# CONFIG_MODULE_SIG_ALL is not set

View File

@ -1,5 +1,4 @@
CONFIG_UAPI_HEADER_TEST=y
CONFIG_LOCALVERSION="-mainline"
CONFIG_AUDIT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
@ -31,6 +30,7 @@ CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_BPF=y
CONFIG_NAMESPACES=y
# CONFIG_PID_NS is not set
CONFIG_RT_SOFTIRQ_AWARE_SCHED=y
# CONFIG_RD_BZIP2 is not set
# CONFIG_RD_LZMA is not set
# CONFIG_RD_XZ is not set
@ -56,7 +56,8 @@ CONFIG_CP15_BARRIER_EMULATION=y
CONFIG_SETEND_EMULATION=y
CONFIG_RANDOMIZE_BASE=y
# CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.stacktrace=off kvm-arm.mode=protected bootconfig ioremap_guard"
CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.page_alloc.sample=10 kasan.stacktrace=off kvm-arm.mode=protected bootconfig ioremap_guard"
CONFIG_CMDLINE_EXTEND=y
# CONFIG_DMI is not set
CONFIG_PM_WAKELOCKS=y
CONFIG_PM_WAKELOCKS_LIMIT=0
@ -75,9 +76,11 @@ CONFIG_ARM_SCPI_CPUFREQ=y
CONFIG_ARM_SCMI_CPUFREQ=y
CONFIG_VIRTUALIZATION=y
CONFIG_KVM=y
CONFIG_KVM_S2MPU=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_SHADOW_CALL_STACK=y
CONFIG_CFI_CLANG=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODVERSIONS=y
@ -140,6 +143,7 @@ CONFIG_IPV6_MROUTE=y
CONFIG_NETFILTER=y
CONFIG_NF_CONNTRACK=y
CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_AMANDA=y
CONFIG_NF_CONNTRACK_FTP=y
@ -566,6 +570,7 @@ CONFIG_QUOTA=y
CONFIG_QFMT_V2=y
CONFIG_FUSE_FS=y
CONFIG_VIRTIO_FS=y
CONFIG_FUSE_BPF=y
CONFIG_OVERLAY_FS=y
CONFIG_INCREMENTAL_FS=y
CONFIG_MSDOS_FS=y

View File

@ -0,0 +1,52 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# Create a separate FIPS archive that duplicates the modules that are relevant
# for FIPS 140 certification as builtin objects
#
sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
aes-ce-blk-y := aes-glue-ce.o aes-ce.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o
sha256-arm64-y := sha256-glue.o sha256-core.o
sha512-arm64-y := sha512-glue.o sha512-core.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
crypto-arm64-fips-src := $(srctree)/arch/arm64/crypto/
crypto-arm64-fips-modules := sha1-ce.o sha2-ce.o sha512-ce.o ghash-ce.o \
aes-ce-cipher.o aes-ce-blk.o aes-neon-blk.o \
sha256-arm64.o sha512-arm64.o aes-arm64.o \
aes-neon-bs.o
crypto-fips-objs += $(foreach o,$(crypto-arm64-fips-modules),$($(o:.o=-y):.o=-fips-arch.o))
CFLAGS_aes-glue-ce-fips-arch.o := -DUSE_V8_CRYPTO_EXTENSIONS
$(obj)/aes-glue-%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS)
$(obj)/aes-glue-%-fips-arch.o: $(crypto-arm64-fips-src)/aes-glue.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS)
$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.S FORCE
$(call if_changed_rule,as_o_S)
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
$(obj)/%-core.S: $(crypto-arm64-fips-src)/%-armv8.pl
$(call cmd,perlasm)
$(obj)/sha256-core.S: $(crypto-arm64-fips-src)/sha512-armv8.pl
$(call cmd,perlasm)
clean-files += sha256-core.S sha512-core.S
$(obj)/%-fips-arch.o: $(obj)/%.S FORCE
$(call if_changed_rule,as_o_S)

View File

@ -19,6 +19,7 @@
#error "cpucaps have overflown ARM64_CB_BIT"
#endif
#ifndef BUILD_FIPS140_KO
#ifndef __ASSEMBLY__
#include <linux/stringify.h>
@ -261,4 +262,50 @@ alternative_has_feature_unlikely(unsigned long feature)
#endif /* __ASSEMBLY__ */
#else
/*
* The FIPS140 module does not support alternatives patching, as this
* invalidates the HMAC digest of the .text section. However, some alternatives
* are known to be irrelevant so we can tolerate them in the FIPS140 module, as
* they will never be applied in the first place in the use cases that the
* FIPS140 module targets (Android running on a production phone). Any other
* uses of alternatives should be avoided, as it is not safe in the general
* case to simply use the default sequence in one place (the fips module) and
* the alternative sequence everywhere else.
*
* Below is an allowlist of features that we can ignore, by simply taking the
* safe default instruction sequence. Note that this implies that the FIPS140
* module is not compatible with VHE, or with pseudo-NMI support.
*/
#define __ALT_ARM64_HAS_LDAPR 0,
#define __ALT_ARM64_HAS_VIRT_HOST_EXTN 0,
#define __ALT_ARM64_HAS_IRQ_PRIO_MASKING 0,
#define ALTERNATIVE(oldinstr, newinstr, feature, ...) \
_ALTERNATIVE(oldinstr, __ALT_ ## feature, #feature)
#define _ALTERNATIVE(oldinstr, feature, feature_str) \
__take_second_arg(feature oldinstr, \
".err Feature " feature_str " not supported in fips140 module")
#ifndef __ASSEMBLY__
#include <linux/types.h>
static __always_inline bool
alternative_has_feature_likely(unsigned long feature)
{
return feature == ARM64_HAS_LDAPR ||
feature == ARM64_HAS_VIRT_HOST_EXTN ||
feature == ARM64_HAS_IRQ_PRIO_MASKING;
}
#define alternative_has_feature_unlikely alternative_has_feature_likely
#endif /* !__ASSEMBLY__ */
#endif /* BUILD_FIPS140_KO */
#endif /* __ASM_ALTERNATIVE_MACROS_H */

View File

@ -109,6 +109,8 @@ void __init early_fixmap_init(void);
extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot);
extern pte_t *__get_fixmap_pte(enum fixed_addresses idx);
#include <asm-generic/fixmap.h>
#endif /* !__ASSEMBLY__ */

View File

@ -6,5 +6,14 @@
void kvm_init_hyp_services(void);
bool kvm_arm_hyp_service_available(u32 func_id);
void kvm_arm_init_hyp_services(void);
void kvm_init_memshare_services(void);
void kvm_init_ioremap_services(void);
#ifdef CONFIG_MEMORY_RELINQUISH
void kvm_init_memrelinquish_services(void);
#else
static inline void kvm_init_memrelinquish_services(void) {}
#endif
#endif

View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2022 - Google LLC
*/
#ifndef __IO_MPT_S2MPU_H__
#define __IO_MPT_S2MPU_H__
#include <linux/bitfield.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_s2mpu.h>
struct s2mpu_mpt_cfg {
enum s2mpu_version version;
};
struct s2mpu_mpt_ops {
u32 (*smpt_size)(void);
void (*init_with_prot)(void *dev_va, enum mpt_prot prot);
void (*init_with_mpt)(void *dev_va, struct mpt *mpt);
void (*apply_range)(void *dev_va, struct mpt *mpt, u32 first_gb, u32 last_gb);
void (*prepare_range)(struct mpt *mpt, phys_addr_t first_byte,
phys_addr_t last_byte, enum mpt_prot prot);
int (*pte_from_addr_smpt)(u32 *smpt, u64 addr);
};
const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg);
#endif /* __IO_MPT_S2MPU_H__ */

View File

@ -135,7 +135,7 @@
* 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are
* not known to exist and will break with this configuration.
*
* The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
* The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu.
*
* Note that when using 4K pages, we concatenate two first level page tables
* together. With 16K pages, we concatenate 16 first level page tables.
@ -344,6 +344,8 @@
#define PAR_TO_HPFAR(par) \
(((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
#define FAR_MASK GENMASK_ULL(11, 0)
#define ECN(x) { ESR_ELx_EC_##x, #x }
#define kvm_arm_exception_class \
@ -361,4 +363,13 @@
#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\
CPACR_EL1_ZEN_EL1EN)
/*
* ARMv8 Reset Values
*/
#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
PSR_F_BIT | PSR_D_BIT)
#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
PSR_AA32_I_BIT | PSR_AA32_F_BIT)
#endif /* __ARM64_KVM_ARM_H__ */

View File

@ -59,23 +59,44 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
__KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize,
/* Hypercalls available after pKVM finalisation */
__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
__KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page,
__KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest,
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
__KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr,
__KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr,
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
__KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state,
__KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init,
__KVM_HOST_SMCCC_FUNC___pkvm_iommu_register,
__KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify,
__KVM_HOST_SMCCC_FUNC___pkvm_iommu_finalize,
__KVM_HOST_SMCCC_FUNC___pkvm_register_hcall,
__KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va,
__KVM_HOST_SMCCC_FUNC___pkvm_map_module_page,
__KVM_HOST_SMCCC_FUNC___pkvm_unmap_module_page,
__KVM_HOST_SMCCC_FUNC___pkvm_init_module,
__KVM_HOST_SMCCC_FUNC___pkvm_close_module_registration,
/*
* Start of the dynamically registered hypercalls. Start a bit
* further, just in case some modules...
*/
__KVM_HOST_SMCCC_FUNC___dynamic_hcalls = 128,
};
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
@ -106,7 +127,7 @@ enum __kvm_host_smccc_func {
#define per_cpu_ptr_nvhe_sym(sym, cpu) \
({ \
unsigned long base, off; \
base = kvm_arm_hyp_percpu_base[cpu]; \
base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \
off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
(unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
@ -211,7 +232,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[];
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
DECLARE_KVM_NVHE_SYM(__per_cpu_end);
@ -231,8 +252,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu);
extern u64 __vgic_v3_get_gic_config(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);
extern u64 __kvm_get_mdcr_el2(void);

View File

@ -42,6 +42,11 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode,
enum exception_type type);
unsigned long get_except64_cpsr(unsigned long old, bool has_mte,
unsigned long sctlr, unsigned long mode);
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__)
@ -490,4 +495,61 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
return test_bit(feature, vcpu->arch.features);
}
static inline int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
{
/*
* For now make sure that both address/generic pointer authentication
* features are requested by the userspace together and the system
* supports these capabilities.
*/
if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) ||
!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC) ||
!system_has_full_ptr_auth())
return -EINVAL;
vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH);
return 0;
}
/* Reset a vcpu's core registers. */
static inline void kvm_reset_vcpu_core(struct kvm_vcpu *vcpu)
{
u32 pstate;
if (vcpu_el1_is_32bit(vcpu)) {
pstate = VCPU_RESET_PSTATE_SVC;
} else {
pstate = VCPU_RESET_PSTATE_EL1;
}
/* Reset core registers */
memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
vcpu->arch.ctxt.spsr_abt = 0;
vcpu->arch.ctxt.spsr_und = 0;
vcpu->arch.ctxt.spsr_irq = 0;
vcpu->arch.ctxt.spsr_fiq = 0;
vcpu_gp_regs(vcpu)->pstate = pstate;
}
/* PSCI reset handling for a vcpu. */
static inline void kvm_reset_vcpu_psci(struct kvm_vcpu *vcpu,
struct vcpu_reset_state *reset_state)
{
unsigned long target_pc = reset_state->pc;
/* Gracefully handle Thumb2 entry point */
if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
target_pc &= ~1UL;
vcpu_set_thumb(vcpu);
}
/* Propagate caller endianness */
if (reset_state->be)
kvm_vcpu_set_be(vcpu);
*vcpu_pc(vcpu) = target_pc;
vcpu_set_reg(vcpu, 0, reset_state->r0);
}
#endif /* __ARM64_KVM_EMULATE_H__ */

View File

@ -73,6 +73,64 @@ u32 __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
struct kvm_hyp_memcache {
phys_addr_t head;
unsigned long nr_pages;
};
static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
phys_addr_t *p,
phys_addr_t (*to_pa)(void *virt))
{
*p = mc->head;
mc->head = to_pa(p);
mc->nr_pages++;
}
static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
void *(*to_va)(phys_addr_t phys))
{
phys_addr_t *p = to_va(mc->head);
if (!mc->nr_pages)
return NULL;
mc->head = *p;
mc->nr_pages--;
return p;
}
static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
unsigned long min_pages,
void *(*alloc_fn)(void *arg),
phys_addr_t (*to_pa)(void *virt),
void *arg)
{
while (mc->nr_pages < min_pages) {
phys_addr_t *p = alloc_fn(arg);
if (!p)
return -ENOMEM;
push_hyp_memcache(mc, p, to_pa);
}
return 0;
}
static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
void (*free_fn)(void *virt, void *arg),
void *(*to_va)(phys_addr_t phys),
void *arg)
{
while (mc->nr_pages)
free_fn(pop_hyp_memcache(mc, to_va), arg);
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm);
void free_hyp_stage2_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm);
int topup_hyp_memcache(struct kvm_vcpu *vcpu);
struct kvm_vmid {
atomic64_t id;
};
@ -115,6 +173,23 @@ struct kvm_smccc_features {
unsigned long vendor_hyp_bmap;
};
struct kvm_pinned_page {
struct rb_node node;
struct page *page;
u64 ipa;
};
typedef unsigned int pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
struct kvm_hyp_memcache teardown_mc;
struct kvm_hyp_memcache teardown_stage2_mc;
struct rb_root pinned_pages;
gpa_t pvmfw_load_addr;
bool enabled;
};
struct kvm_arch {
struct kvm_s2_mmu mmu;
@ -149,7 +224,8 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_EL1_32BIT 4
/* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5
/* Guest has bought into the MMIO guard extension */
#define KVM_ARCH_FLAG_MMIO_GUARD 6
unsigned long flags;
/*
@ -166,6 +242,12 @@ struct kvm_arch {
/* Hypercall features firmware registers' descriptor */
struct kvm_smccc_features smccc_feat;
/*
* For an untrusted host VM, 'pkvm.handle' is used to lookup
* the associated pKVM instance in the hypervisor.
*/
struct kvm_protected_vm pkvm;
};
struct kvm_vcpu_fault_info {
@ -277,6 +359,7 @@ struct kvm_host_data {
struct kvm_host_psci_config {
/* PSCI version used by host. */
u32 version;
u32 smccc_version;
/* Function IDs used by host if version is v0.1. */
struct psci_0_1_function_ids function_ids_0_1;
@ -296,6 +379,35 @@ extern s64 kvm_nvhe_sym(hyp_physvirt_offset);
extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS];
#define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map)
enum pkvm_iommu_pm_event {
PKVM_IOMMU_PM_SUSPEND,
PKVM_IOMMU_PM_RESUME,
};
struct pkvm_iommu_ops;
struct pkvm_iommu_driver {
const struct pkvm_iommu_ops *ops;
struct list_head list;
atomic_t state;
};
extern struct pkvm_iommu_driver kvm_nvhe_sym(pkvm_s2mpu_driver);
extern struct pkvm_iommu_driver kvm_nvhe_sym(pkvm_sysmmu_sync_driver);
int pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size);
int pkvm_iommu_register(struct device *dev, struct pkvm_iommu_driver *drv,
phys_addr_t pa, size_t size, struct device *parent);
int pkvm_iommu_suspend(struct device *dev);
int pkvm_iommu_resume(struct device *dev);
int pkvm_iommu_s2mpu_init(u32 version);
int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t pa);
int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t pa,
struct device *parent);
/* Reject future calls to pkvm_iommu_driver_init() and pkvm_iommu_register(). */
int pkvm_iommu_finalize(void);
struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
@ -399,8 +511,12 @@ struct kvm_vcpu_arch {
/* vcpu power state */
struct kvm_mp_state mp_state;
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
union {
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
/* Pages to be donated to pkvm/EL2 if it runs out */
struct kvm_hyp_memcache pkvm_memcache;
};
/* Target CPU and feature flags */
int target;
@ -474,9 +590,25 @@ struct kvm_vcpu_arch {
*fset &= ~(m); \
} while (0)
#define __vcpu_copy_flag(vt, vs, flagset, f, m) \
do { \
typeof(vs->arch.flagset) tmp, val; \
\
__build_check_flag(vs, flagset, f, m); \
\
val = READ_ONCE(vs->arch.flagset); \
val &= (m); \
tmp = READ_ONCE(vt->arch.flagset); \
tmp &= ~(m); \
tmp |= val; \
WRITE_ONCE(vt->arch.flagset, tmp); \
} while (0)
#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)
#define vcpu_copy_flag(vt, vs,...) __vcpu_copy_flag((vt), (vs), __VA_ARGS__)
/* SVE exposed to guest */
#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0))
@ -494,6 +626,8 @@ struct kvm_vcpu_arch {
#define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1))
/* Target EL/MODE (not a single flag, but let's abuse the macro) */
#define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1))
/* Cover both PENDING_EXCEPTION and EXCEPT_MASK for global operations */
#define PC_UPDATE_REQ __vcpu_single_flag(iflags, GENMASK(3, 0))
/* Helpers to encode exceptions with minimum fuss */
#define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK)
@ -525,6 +659,8 @@ struct kvm_vcpu_arch {
#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5))
/* Save TRBE context if active */
#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6))
/* pKVM host vcpu state is dirty, needs resync */
#define PKVM_HOST_STATE_DIRTY __vcpu_single_flag(iflags, BIT(7))
/* SVE enabled for host EL0 */
#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0))
@ -601,9 +737,6 @@ struct kvm_vcpu_arch {
#define __vcpu_sys_reg(v,r) (ctxt_sys_reg(&(v)->arch.ctxt, (r)))
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
{
/*
@ -695,8 +828,32 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
return true;
}
#define vcpu_read_sys_reg(__vcpu, reg) \
({ \
u64 __val = 0x8badf00d8badf00d; \
\
/* SYSREGS_ON_CPU is only used in VHE */ \
((!is_nvhe_hyp_code() && \
vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) && \
__vcpu_read_sys_reg_from_cpu(reg, &__val))) ? \
__val \
: \
ctxt_sys_reg(&__vcpu->arch.ctxt, reg); \
})
#define vcpu_write_sys_reg(__vcpu, __val, reg) \
do { \
/* SYSREGS_ON_CPU is only used in VHE */ \
if (is_nvhe_hyp_code() || \
!vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) || \
!__vcpu_write_sys_reg_to_cpu(__val, reg)) \
ctxt_sys_reg(&__vcpu->arch.ctxt, reg) = __val; \
} while (0)
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
atomic64_t protected_hyp_mem;
atomic64_t protected_shared_mem;
};
struct kvm_vcpu_stat {
@ -869,9 +1026,26 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
#define __vcpu_save_guest_debug_regs(vcpu) \
do { \
u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); \
\
(vcpu)->arch.guest_debug_preserved.mdscr_el1 = val; \
} while(0)
#define __vcpu_restore_guest_debug_regs(vcpu) \
do { \
u64 val = (vcpu)->arch.guest_debug_preserved.mdscr_el1; \
\
vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); \
} while (0)
#define kvm_vcpu_os_lock_enabled(vcpu) \
(!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & SYS_OSLSR_OSLK))
#define kvm_vcpu_needs_debug_regs(vcpu) \
((vcpu)->guest_debug || kvm_vcpu_os_lock_enabled(vcpu))
int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
@ -915,12 +1089,7 @@ int kvm_set_ipa_limit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
static inline bool kvm_vm_is_protected(struct kvm *kvm)
{
return false;
}
#define kvm_vm_is_protected(kvm) ((kvm)->arch.pkvm.enabled)
void kvm_init_protected_traps(struct kvm_vcpu *vcpu);

View File

@ -15,6 +15,9 @@
DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
DECLARE_PER_CPU(int, hyp_cpu_number);
#define hyp_smp_processor_id() (__this_cpu_read(hyp_cpu_number))
#define read_sysreg_elx(r,nvh,vh) \
({ \
@ -61,8 +64,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
#ifdef __KVM_NVHE_HYPERVISOR__
@ -90,6 +93,7 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
void __sve_save_state(void *sve_pffr, u32 *fpsr);
void __sve_restore_state(void *sve_pffr, u32 *fpsr);
#ifndef __KVM_NVHE_HYPERVISOR__
@ -123,4 +127,10 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val);
extern unsigned long kvm_nvhe_sym(__icache_flags);
extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
extern bool kvm_nvhe_sym(smccc_trng_available);
extern bool kvm_nvhe_sym(__pkvm_modules_enabled);
#endif /* __ARM64_KVM_HYP_H__ */

View File

@ -116,6 +116,7 @@ alternative_cb_end
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/kvm_host.h>
#include <asm/kvm_pkvm_module.h>
void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
@ -166,7 +167,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
@ -187,8 +188,13 @@ static inline void *__kvm_vector_slot2addr(void *base,
struct kvm;
#define kvm_flush_dcache_to_poc(a,l) \
dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l))
#define kvm_flush_dcache_to_poc(a, l) do { \
unsigned long __a = (unsigned long)(a); \
unsigned long __l = (unsigned long)(l); \
\
if (__l) \
dcache_clean_inval_poc(__a, __a + __l); \
} while (0)
static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{

View File

@ -42,6 +42,38 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
#define KVM_PHYS_INVALID (-1ULL)
#define KVM_PTE_TYPE BIT(1)
#define KVM_PTE_TYPE_BLOCK 0
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
@ -57,6 +89,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
return pa;
}
static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
{
kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
if (PAGE_SHIFT == 16) {
pa &= GENMASK(51, 48);
pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
}
return pte;
}
static inline u64 kvm_granule_shift(u32 level)
{
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
@ -73,6 +117,17 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
}
static inline bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
return false;
if (!kvm_pte_valid(pte))
return false;
return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page.
@ -129,6 +184,7 @@ enum kvm_pgtable_stage2_flags {
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
* @KVM_PGTABLE_PROT_NC: Normal non-cacheable attributes.
* @KVM_PGTABLE_PROT_SW0: Software bit 0.
* @KVM_PGTABLE_PROT_SW1: Software bit 1.
* @KVM_PGTABLE_PROT_SW2: Software bit 2.
@ -140,6 +196,7 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
KVM_PGTABLE_PROT_NC = BIT(4),
KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
@ -161,6 +218,22 @@ enum kvm_pgtable_prot {
typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
enum kvm_pgtable_prot prot);
typedef bool (*kvm_pgtable_pte_is_counted_cb_t)(kvm_pte_t pte, u32 level);
/**
* struct kvm_pgtable_pte_ops - PTE callbacks.
* @force_pte_cb:		Return true if mappings in the given range must be
*				forced to page granularity instead of block
*				mappings.
* @pte_is_counted_cb:		Verify the attributes of the @pte argument
*				and return true if the descriptor needs to be
*				refcounted, otherwise return false.
*/
struct kvm_pgtable_pte_ops {
kvm_pgtable_force_pte_cb_t force_pte_cb;
kvm_pgtable_pte_is_counted_cb_t pte_is_counted_cb;
};
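For illustration, here is a minimal sketch of how a caller might populate these callbacks; the my_* names are hypothetical and not part of this patch:

static bool my_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
{
	/* Never force page granularity: block mappings remain allowed. */
	return false;
}

static bool my_pte_is_counted_cb(kvm_pte_t pte, u32 level)
{
	/* Refcount only valid descriptors. */
	return kvm_pte_valid(pte);
}

static struct kvm_pgtable_pte_ops my_pte_ops = {
	.force_pte_cb		= my_force_pte_cb,
	.pte_is_counted_cb	= my_pte_is_counted_cb,
};

A stage-2 table would then be initialised with kvm_pgtable_stage2_init(pgt, mmu, mm_ops, &my_pte_ops), matching the updated prototype further down.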
/**
* struct kvm_pgtable - KVM page-table.
* @ia_bits: Maximum input address size, in bits.
@ -169,8 +242,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
* @mm_ops: Memory management callbacks.
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
* @flags: Stage-2 page-table flags.
* @force_pte_cb: Function that returns true if page level mappings must
* be used instead of block mappings.
* @pte_ops: PTE callbacks.
*/
struct kvm_pgtable {
u32 ia_bits;
@ -181,7 +253,7 @@ struct kvm_pgtable {
/* Stage-2 only */
struct kvm_s2_mmu *mmu;
enum kvm_pgtable_stage2_flags flags;
kvm_pgtable_force_pte_cb_t force_pte_cb;
struct kvm_pgtable_pte_ops *pte_ops;
};
/**
@ -296,24 +368,31 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
*/
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
/**
* kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
* @vtcr: Content of the VTCR register.
*
* Return: the size (in bytes) of the stage-2 PGD
*/
size_t kvm_pgtable_stage2_pgd_size(u64 vtcr);
/**
* __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @mmu: S2 MMU context for this S2 translation
* @mm_ops: Memory management callbacks.
* @flags: Stage-2 configuration flags.
* @force_pte_cb: Function that returns true if page level mappings must
* be used instead of block mappings.
* @pte_ops: PTE callbacks.
*
* Return: 0 on success, negative error code on failure.
*/
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops,
enum kvm_pgtable_stage2_flags flags,
kvm_pgtable_force_pte_cb_t force_pte_cb);
struct kvm_pgtable_pte_ops *pte_ops);
#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops) \
__kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL)
#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops, pte_ops) \
__kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, pte_ops)
/**
* kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
@ -357,14 +436,16 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
void *mc);
/**
* kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
* track ownership.
* kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space
* to track ownership (and more).
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Base intermediate physical address to annotate.
* @size: Size of the annotated range.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
* @owner_id: Unique identifier for the owner of the page.
* @annotation: A 63-bit value that will be stored in the page tables.
* @annotation[0] must be 0, and @annotation[63:1] is stored
* in the page tables.
*
* By default, all page-tables are owned by identifier 0. This function can be
* used to mark portions of the IPA space as owned by other entities. When a
@ -373,8 +454,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
void *mc, u8 owner_id);
int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
void *mc, kvm_pte_t annotation);
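As a hedged sketch of how the old owner-id behaviour maps onto this interface (the mask and helper below are illustrative, not necessarily the hypervisor's actual definitions), the owner can be packed into bits above bit 0 so that bit 0 of the annotation stays clear:

/* Illustrative only: pack an 8-bit owner id into annotation bits [9:2]. */
#define MY_PTE_OWNER_MASK	GENMASK(9, 2)

static kvm_pte_t my_owner_annotation(u8 owner_id)
{
	/* Bit 0 is left clear, as kvm_pgtable_stage2_annotate() requires. */
	return FIELD_PREP(MY_PTE_OWNER_MASK, owner_id);
}

Calling kvm_pgtable_stage2_annotate(pgt, addr, size, mc, my_owner_annotation(id)) then behaves like the old set_owner call.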
/**
* kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.


@ -2,18 +2,298 @@
/*
* Copyright (C) 2020 - Google LLC
* Author: Quentin Perret <qperret@google.com>
* Author: Fuad Tabba <tabba@google.com>
*/
#ifndef __ARM64_KVM_PKVM_H__
#define __ARM64_KVM_PKVM_H__
#include <linux/arm_ffa.h>
#include <linux/memblock.h>
#include <linux/scatterlist.h>
#include <asm/kvm_pgtable.h>
#include <asm/sysreg.h>
/* Maximum number of VMs that can co-exist under pKVM. */
#define KVM_MAX_PVMS 255
#define HYP_MEMBLOCK_REGIONS 128
#define PVMFW_INVALID_LOAD_ADDR (-1)
int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
int pkvm_init_host_vm(struct kvm *kvm, unsigned long type);
int pkvm_create_hyp_vm(struct kvm *kvm);
void pkvm_destroy_hyp_vm(struct kvm *kvm);
void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa);
/*
* Definitions for features to be allowed or restricted for guest virtual
* machines, depending on the mode KVM is running in and on the type of guest
* that is running.
*
* The ALLOW masks represent a bitmask of feature fields that are allowed
* without any restrictions as long as they are supported by the system.
*
* The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for
* features that are restricted to at most the specified level of support.
*
* If a feature field is not present in either, then it is not supported.
*
* The approach taken for protected VMs is to allow features that are:
* - Needed by common Linux distributions (e.g., floating point)
* - Trivial to support, e.g., supporting the feature does not introduce or
* require tracking of additional state in KVM
* - Impossible to trap, so the guest cannot be prevented from using them anyway
*/
/*
* Allow for protected VMs:
* - Floating-point and Advanced SIMD
* - GICv3(+) system register interface
* - Data Independent Timing
*/
#define PVM_ID_AA64PFR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC) | \
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \
)
/*
* Restrict to the following *unsigned* features for protected VMs:
* - AArch64 guests only (no support for AArch32 guests):
* AArch32 adds complexity in trap handling, emulation, condition codes,
* etc...
* - RAS (v1)
* Supported by KVM
*/
#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \
)
/*
* Allow for protected VMs:
* - Branch Target Identification
* - Speculative Store Bypassing
*/
#define PVM_ID_AA64PFR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \
ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \
)
/*
* Allow for protected VMs:
* - Mixed-endian
* - Distinction between Secure and Non-secure Memory
* - Mixed-endian at EL0 only
* - Non-context synchronizing exception entry and exit
*/
#define PVM_ID_AA64MMFR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \
)
/*
* Restrict to the following *unsigned* features for protected VMs:
* - 40-bit IPA
* - 16-bit ASID
*/
#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \
)
/*
* Allow for protected VMs:
* - Hardware translation table updates to Access flag and Dirty state
* - Number of VMID bits from CPU
* - Hierarchical Permission Disables
* - Privileged Access Never
* - SError interrupt exceptions from speculative reads
* - Enhanced Translation Synchronization
*/
#define PVM_ID_AA64MMFR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \
)
/*
* Allow for protected VMs:
* - Common not Private translations
* - User Access Override
* - IESB bit in the SCTLR_ELx registers
* - Unaligned single-copy atomicity and atomic functions
* - ESR_ELx.EC value on an exception by read access to feature ID space
* - TTL field in address operations.
* - Break-before-make sequences when changing translation block size
* - E0PDx mechanism
*/
#define PVM_ID_AA64MMFR2_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \
)
/*
* No support for Scalable Vectors for protected VMs:
* Requires additional support from KVM, e.g., context-switching and
* trapping at EL2
*/
#define PVM_ID_AA64ZFR0_ALLOW (0ULL)
/*
* No support for debug, including breakpoints and watchpoints, for
* protected VMs:
* The Arm architecture mandates support for at least the Armv8 debug
* architecture, which would include at least 2 hardware breakpoints and
* watchpoints. Providing that support to protected guests adds
* considerable state and complexity. Therefore, the reserved value of 0 is
* used for debug-related fields.
*/
#define PVM_ID_AA64DFR0_ALLOW (0ULL)
#define PVM_ID_AA64DFR1_ALLOW (0ULL)
/*
* No support for implementation defined features.
*/
#define PVM_ID_AA64AFR0_ALLOW (0ULL)
#define PVM_ID_AA64AFR1_ALLOW (0ULL)
/*
* No restrictions on instructions implemented in AArch64.
*/
#define PVM_ID_AA64ISAR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \
)
#define PVM_ID_AA64ISAR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \
)
#define PVM_ID_AA64ISAR2_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \
)
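To show how the ALLOW and RESTRICT_UNSIGNED masks are meant to be combined, here is a minimal sketch; the helper name and the single-field handling are assumptions for illustration, since the hypervisor walks every restricted field rather than just one:

static u64 pvm_sanitise_id_aa64pfr0(void)
{
	u64 host = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
	/* Keep only the fields that are allowed without restriction. */
	u64 val = host & PVM_ID_AA64PFR0_ALLOW;
	/* For a restricted unsigned field, expose min(host, restriction). */
	u64 cap = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS),
			    PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
	u64 ras = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), host);

	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS),
			  min(ras, cap));
	return val;
}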
/*
* Returns the maximum number of breakpoints supported for protected VMs.
*/
static inline int pkvm_get_max_brps(void)
{
int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs),
PVM_ID_AA64DFR0_ALLOW);
/*
* If breakpoints are supported, the maximum number is 1 + the field.
* Otherwise, return 0, which is not compliant with the architecture,
* but is reserved and is used here to indicate no debug support.
*/
return num ? num + 1 : 0;
}
/*
* Returns the maximum number of watchpoints supported for protected VMs.
*/
static inline int pkvm_get_max_wrps(void)
{
int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs),
PVM_ID_AA64DFR0_ALLOW);
return num ? num + 1 : 0;
}
enum pkvm_moveable_reg_type {
PKVM_MREG_MEMORY,
PKVM_MREG_PROTECTED_RANGE,
};
struct pkvm_moveable_reg {
phys_addr_t start;
u64 size;
enum pkvm_moveable_reg_type type;
};
#define PKVM_NR_MOVEABLE_REGS 512
extern struct pkvm_moveable_reg kvm_nvhe_sym(pkvm_moveable_regs)[];
extern unsigned int kvm_nvhe_sym(pkvm_moveable_regs_nr);
extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
extern phys_addr_t kvm_nvhe_sym(pvmfw_base);
extern phys_addr_t kvm_nvhe_sym(pvmfw_size);
static inline unsigned long
hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size)
{
unsigned long nr_pages = reg->size >> PAGE_SHIFT;
unsigned long start, end;
start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size;
end = start + nr_pages * vmemmap_entry_size;
start = ALIGN_DOWN(start, PAGE_SIZE);
end = ALIGN(end, PAGE_SIZE);
return end - start;
}
static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
{
unsigned long res = 0, i;
for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i],
vmemmap_entry_size);
}
return res >> PAGE_SHIFT;
}
static inline unsigned long hyp_vm_table_pages(void)
{
return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT;
}
static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
{
unsigned long total = 0, i;
@ -27,27 +307,28 @@ static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
return total;
}
static inline unsigned long __hyp_pgtable_total_pages(void)
static inline unsigned long __hyp_pgtable_moveable_regs_pages(void)
{
unsigned long res = 0, i;
/* Cover all of memory with page-granularity */
for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i];
/* Cover all of moveable regions with page-granularity */
for (i = 0; i < kvm_nvhe_sym(pkvm_moveable_regs_nr); i++) {
struct pkvm_moveable_reg *reg = &kvm_nvhe_sym(pkvm_moveable_regs)[i];
res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT);
}
return res;
}
#define __PKVM_PRIVATE_SZ SZ_1G
static inline unsigned long hyp_s1_pgtable_pages(void)
{
unsigned long res;
res = __hyp_pgtable_total_pages();
res = __hyp_pgtable_moveable_regs_pages();
/* Allow 1 GiB for private mappings */
res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
res += __hyp_pgtable_max_pages(__PKVM_PRIVATE_SZ >> PAGE_SHIFT);
return res;
}
@ -60,12 +341,48 @@ static inline unsigned long host_s2_pgtable_pages(void)
* Include an extra 16 pages to safely upper-bound the worst case of
* concatenated pgds.
*/
res = __hyp_pgtable_total_pages() + 16;
res = __hyp_pgtable_moveable_regs_pages() + 16;
/* Allow 1 GiB for MMIO mappings */
/* Allow 1 GiB for non-moveable regions */
res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
return res;
}
#define KVM_FFA_MBOX_NR_PAGES 1
/*
* Maximum number of constituents allowed in a descriptor. This number is
* arbitrary, see comment below on SG_MAX_SEGMENTS in hyp_ffa_proxy_pages().
*/
#define KVM_FFA_MAX_NR_CONSTITUENTS 4096
static inline unsigned long hyp_ffa_proxy_pages(void)
{
size_t desc_max;
/*
* SG_MAX_SEGMENTS is supposed to bound the number of elements in an
* sglist, which should match the number of constituents in the
* corresponding FFA descriptor. As such, the EL2 buffer needs to be
* large enough to hold a descriptor with at least SG_MAX_SEGMENTS
* constituents. But the kernel's DMA code doesn't enforce the limit, and
* it is sometimes abused, so let's allow larger descriptors and hope
* for the best.
*/
BUILD_BUG_ON(KVM_FFA_MAX_NR_CONSTITUENTS < SG_MAX_SEGMENTS);
/*
* The hypervisor FFA proxy needs enough memory to buffer a fragmented
* descriptor returned from EL3 in response to a RETRIEVE_REQ call.
*/
desc_max = sizeof(struct ffa_mem_region) +
sizeof(struct ffa_mem_region_attributes) +
sizeof(struct ffa_composite_mem_region) +
KVM_FFA_MAX_NR_CONSTITUENTS * sizeof(struct ffa_mem_region_addr_range);
/* Plus a page each for the hypervisor's RX and TX mailboxes. */
return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE);
}
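For a rough sense of scale, assuming 4 KiB pages and a 16-byte struct ffa_mem_region_addr_range (both assumptions for illustration): the constituent array alone is 4096 * 16 B = 64 KiB, so desc_max rounds up to 17 pages once the region headers are included, and the two mailbox pages bring the total to roughly 19 pages of EL2 buffer.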
#endif /* __ARM64_KVM_PKVM_H__ */


@ -0,0 +1,91 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ARM64_KVM_PKVM_MODULE_H__
#define __ARM64_KVM_PKVM_MODULE_H__
#include <asm/kvm_pgtable.h>
#include <linux/export.h>
typedef void (*dyn_hcall_t)(struct kvm_cpu_context *);
enum pkvm_psci_notification {
PKVM_PSCI_CPU_SUSPEND,
PKVM_PSCI_SYSTEM_SUSPEND,
PKVM_PSCI_CPU_ENTRY,
};
#ifdef CONFIG_MODULES
struct pkvm_module_ops {
int (*create_private_mapping)(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot,
unsigned long *haddr);
int (*register_serial_driver)(void (*hyp_putc_cb)(char));
void (*puts)(const char *str);
void (*putx64)(u64 num);
void *(*fixmap_map)(phys_addr_t phys);
void (*fixmap_unmap)(void);
void *(*linear_map_early)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot);
void (*linear_unmap_early)(void *addr, size_t size);
void (*flush_dcache_to_poc)(void *addr, size_t size);
int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr));
int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot);
int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *));
int (*register_default_trap_handler)(bool (*cb)(struct kvm_cpu_context *));
int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *));
int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *));
int (*register_hyp_panic_notifier)(void (*cb)(struct kvm_cpu_context *host_ctxt));
};
int __pkvm_load_el2_module(struct module *this, unsigned long *token);
int __pkvm_register_el2_call(unsigned long hfn_hyp_va);
#else
static inline int __pkvm_load_el2_module(struct module *this,
unsigned long *token)
{
return -ENOSYS;
}
static inline int __pkvm_register_el2_call(unsigned long hfn_hyp_va)
{
return -ENOSYS;
}
#endif /* CONFIG_MODULES */
#ifdef MODULE
/*
* Convert an EL2 module address from its kernel VA to the corresponding hyp VA.
*/
#define pkvm_el2_mod_va(kern_va, token) \
({ \
unsigned long hyp_text_kern_va = \
(unsigned long)THIS_MODULE->arch.hyp.text.start; \
unsigned long offset; \
\
offset = (unsigned long)kern_va - hyp_text_kern_va; \
token + offset; \
})
#define pkvm_load_el2_module(init_fn, token) \
({ \
THIS_MODULE->arch.hyp.init = init_fn; \
__pkvm_load_el2_module(THIS_MODULE, token); \
})
#define pkvm_register_el2_mod_call(hfn, token) \
({ \
__pkvm_register_el2_call(pkvm_el2_mod_va(hfn, token)); \
})
#define pkvm_el2_mod_call(id, ...) \
({ \
struct arm_smccc_res res; \
\
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_ID(id), \
##__VA_ARGS__, &res); \
WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
\
res.a1; \
})
#endif
#endif
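A minimal sketch of how a hypothetical module might use this API; hyp_hello_init and the banner string are made up, and the build-time placement of the EL2 code into the .hyp.* sections is omitted here:

/* Runs at EL2 once the module sections have been donated to the hypervisor. */
static int hyp_hello_init(const struct pkvm_module_ops *ops)
{
	ops->puts("pKVM module loaded");
	return 0;
}

static unsigned long hyp_token;

static int __init hello_init(void)
{
	/* Donate this module's .hyp.* sections and run hyp_hello_init at EL2. */
	return pkvm_load_el2_module(hyp_hello_init, &hyp_token);
}
module_init(hello_init);

Host-callable EL2 entry points would then typically be registered with pkvm_register_el2_mod_call() and invoked through pkvm_el2_mod_call() using the identifier it returns.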


@ -0,0 +1,436 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2021 - Google LLC
* Author: David Brazdil <dbrazdil@google.com>
*/
#ifndef __ARM64_KVM_S2MPU_H__
#define __ARM64_KVM_S2MPU_H__
#include <linux/bitfield.h>
#include <asm/kvm_mmu.h>
#define S2MPU_MMIO_SIZE SZ_64K
#define SYSMMU_SYNC_MMIO_SIZE SZ_64K
#define SYSMMU_SYNC_S2_OFFSET SZ_32K
#define SYSMMU_SYNC_S2_MMIO_SIZE (SYSMMU_SYNC_MMIO_SIZE - \
SYSMMU_SYNC_S2_OFFSET)
#define NR_VIDS 8
#define NR_CTX_IDS 8
#define ALL_VIDS_BITMAP GENMASK(NR_VIDS - 1, 0)
/*
* S2MPU V9-specific values (some new, some different from older versions).
* To avoid any confusion, all names are prefixed with V9.
*/
#define REG_NS_V9_CTRL_PROT_EN_PER_VID_SET 0x50
#define REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET 0x70
#define REG_NS_V9_CFG_MPTW_ATTRIBUTE 0x10
#define REG_NS_V9_READ_MPTC 0x3014
#define REG_NS_V9_READ_MPTC_TAG_PPN 0x3018
#define REG_NS_V9_READ_MPTC_TAG_OTHERS 0x301C
#define REG_NS_V9_READ_MPTC_DATA 0x3020
#define REG_NS_V9_READ_PTLB 0x3030
#define REG_NS_V9_READ_PTLB_TAG 0x3034
#define REG_NS_V9_READ_PTLB_DATA_S1_EN_PPN_AP 0x3040
#define REG_NS_V9_READ_PTLB_DATA_S1_DIS_AP_LIST 0x3044
#define REG_NS_V9_PMMU_INDICATOR 0x3050
#define REG_NS_V9_PMMU_INFO 0x3100
#define REG_NS_V9_PMMU_PTLB_INFO(n) (0x3400 + (n)*0x4)
#define REG_NS_V9_SWALKER_INFO 0x3104
#define REG_NS_V9_MPTC_INFO 0x3C00
/* V9 Masks */
#define V9_READ_MPTC_TAG_PPN_VALID_MASK BIT(28)
#define V9_READ_MPTC_TAG_PPN_TPN_PPN_MASK GENMASK(23, 0)
#define V9_READ_MPTC_TAG_PPN_MASK (V9_READ_MPTC_TAG_PPN_VALID_MASK | \
V9_READ_MPTC_TAG_PPN_TPN_PPN_MASK)
#define V9_READ_MPTC_TAG_OTHERS_VID_MASK GENMASK(10, 8)
#define V9_READ_MPTC_TAG_OTHERS_PAGE_GRAN_MASK GENMASK(5, 4)
#define V9_READ_MPTC_TAG_OTHERS_MASK (V9_READ_MPTC_TAG_OTHERS_VID_MASK | \
V9_READ_MPTC_TAG_OTHERS_PAGE_GRAN_MASK)
#define V9_READ_PTLB_WAY_MASK GENMASK(31, 24)
#define V9_READ_PTLB_SET_MASK GENMASK(23, 16)
#define V9_READ_PTLB_PTLB_MASK GENMASK(15, 4)
#define V9_READ_PTLB_PMMU_MASK GENMASK(3, 0)
#define V9_READ_PTLB_MASK (V9_READ_PTLB_WAY_MASK | V9_READ_PTLB_SET_MASK | \
V9_READ_PTLB_PTLB_MASK | V9_READ_PTLB_PMMU_MASK)
#define V9_READ_PTLB_TAG_VALID_MASK BIT(31)
#define V9_READ_PTLB_TAG_PAGE_SIZE_MASK GENMASK(30, 28)
#define V9_READ_PTLB_TAG_STAGE1_ENABLED_MASK BIT(27)
#define V9_READ_PTLB_TAG_VID_MASK GENMASK(26, 24)
#define V9_READ_PTLB_TAG_TPN_MASK GENMASK(23, 0)
#define V9_READ_PTLB_TAG_MASK (V9_READ_PTLB_TAG_VALID_MASK | \
V9_READ_PTLB_TAG_TPN_MASK | \
V9_READ_PTLB_TAG_VID_MASK | \
V9_READ_PTLB_TAG_PAGE_SIZE_MASK | \
V9_READ_PTLB_TAG_STAGE1_ENABLED_MASK)
#define V9_READ_PTLB_DTA_S1_EN_PPN_AP_S2AP_MASK GENMASK(25, 24)
#define V9_READ_PTLB_DTA_S1_EN_PPN_AP_PPN_MASK GENMASK(23, 0)
#define V9_READ_PTLB_DATA_S1_ENABLE_PPN_AP_MASK (V9_READ_PTLB_DTA_S1_EN_PPN_AP_S2AP_MASK | \
V9_READ_PTLB_DTA_S1_EN_PPN_AP_PPN_MASK)
#define V9_READ_MPTC_INFO_NUM_MPTC_SET GENMASK(31, 16)
#define V9_READ_MPTC_INFO_NUM_MPTC_WAY GENMASK(15, 12)
#define V9_READ_MPTC_INFO_MASK (V9_READ_MPTC_INFO_NUM_MPTC_SET | \
V9_READ_MPTC_INFO_NUM_MPTC_SET)
#define V9_READ_PMMU_INFO_NUM_PTLB GENMASK(15, 1)
#define V9_READ_PMMU_INFO_VA_WIDTH BIT(0)
#define V9_READ_PMMU_INFO_NUM_STREAM_TABLE GENMASK(31, 16)
#define V9_READ_PMMU_INFO_MASK (V9_READ_PMMU_INFO_NUM_PTLB | \
V9_READ_PMMU_INFO_VA_WIDTH | \
V9_READ_PMMU_INFO_NUM_STREAM_TABLE)
#define V9_READ_PMMU_PTLB_INFO_NUM_WAY GENMASK(31, 16)
#define V9_READ_PMMU_PTLB_INFO_NUM_SET GENMASK(15, 0)
#define V9_READ_PMMU_PTLB_INFO_MASK (V9_READ_PMMU_PTLB_INFO_NUM_WAY | \
V9_READ_PMMU_PTLB_INFO_NUM_SET)
#define V9_READ_PMMU_INDICATOR_PMMU_NUM GENMASK(3, 0)
#define V9_READ_PMMU_INDICATOR_MASK V9_READ_PMMU_INDICATOR_PMMU_NUM
#define V9_READ_MPTC_WAY_MASK GENMASK(17, 16)
#define V9_READ_MPTC_SET_MASK GENMASK(15, 0)
#define V9_READ_MPTC_MASK (V9_READ_MPTC_WAY_MASK | \
V9_READ_MPTC_SET_MASK)
#define V9_READ_MPTC_WAY(way) FIELD_PREP(V9_READ_MPTC_WAY_MASK, (way))
#define V9_READ_MPTC_SET(set) FIELD_PREP(V9_READ_MPTC_SET_MASK, (set))
#define V9_READ_MPTC(set, way) (V9_READ_MPTC_SET(set) | V9_READ_MPTC_WAY(way))
#define V9_READ_PTLB_WAY(x) FIELD_PREP(V9_READ_PTLB_WAY_MASK, (x))
#define V9_READ_PTLB_SET(x) FIELD_PREP(V9_READ_PTLB_SET_MASK, (x))
#define V9_READ_PTLB_PTLB(x) FIELD_PREP(V9_READ_PTLB_PTLB_MASK, (x))
#define V9_READ_PTLB_PMMU(x) FIELD_PREP(V9_READ_PTLB_PMMU_MASK, (x))
#define V9_READ_PTLB(pu_i, pb_i, s, w) (V9_READ_PTLB_WAY(w) | V9_READ_PTLB_SET(s) | \
V9_READ_PTLB_PTLB(pb_i) | V9_READ_PTLB_PMMU(pu_i))
#define V9_READ_SLTB_INFO_SET_MASK GENMASK(15, 0)
#define V9_READ_SLTB_INFO_WAY_MASK GENMASK(31, 16)
#define V9_READ_SLTB_INFO_MASK (V9_READ_SLTB_INFO_SET_MASK | \
V9_READ_SLTB_INFO_WAY_MASK)
#define V9_SWALKER_INFO_NUM_STLB_MASK GENMASK(31, 16)
#define V9_SWALKER_INFO_NUM_PMMU_MASK GENMASK(15, 0)
#define V9_SWALKER_INFO_MASK (V9_SWALKER_INFO_NUM_STLB_MASK | \
V9_SWALKER_INFO_NUM_PMMU_MASK)
/*
* STLB has 2 types: A,B based on how S2MPU is connected
* registers or masks that vary based on type are suffixed with
* either TYPEA or TYPEB.
*/
#define REG_NS_V9_READ_STLB 0x3000
#define REG_NS_V9_READ_STLB_TPN 0x3004
#define REG_NS_V9_READ_STLB_TAG_PPN 0x3008
#define REG_NS_V9_READ_STLB_TAG_OTHERS 0x300C
#define REG_NS_V9_READ_STLB_DATA 0x3010
#define REG_NS_V9_STLB_INFO(n) (0x3800 + (n)*0x4)
#define V9_READ_STLB_SET_MASK_TYPEA GENMASK(7, 0)
#define V9_READ_STLB_WAY_MASK_TYPEA GENMASK(15, 8)
#define V9_READ_STLB_SUBLINE_MASK_TYPEA GENMASK(31, 20)
#define V9_READ_STLB_STLBID_MASK_TYPEA GENMASK(17, 16)
#define V9_READ_STLB_MASK_TYPEA (V9_READ_STLB_SET_MASK_TYPEA | \
V9_READ_STLB_WAY_MASK_TYPEA | \
V9_READ_STLB_SUBLINE_MASK_TYPEA | \
V9_READ_STLB_STLBID_MASK_TYPEA)
#define V9_READ_STLB_SET_MASK_TYPEB GENMASK(15, 0)
#define V9_READ_STLB_WAY_MASK_TYPEB GENMASK(17, 16)
#define V9_READ_STLB_STLBID_MASK_TYPEB GENMASK(31, 20)
#define V9_READ_STLB_MASK_TYPEB (V9_READ_STLB_SET_MASK_TYPEB | \
V9_READ_STLB_WAY_MASK_TYPEB | \
V9_READ_STLB_STLBID_MASK_TYPEB)
#define V9_READ_STLB_TPN_TPN_MASK GENMASK(23, 0)
#define V9_READ_STLB_TPN_S2VALID_MASK BIT(24)
#define V9_READ_STLB_TPN_STAGE1_ENABLED_MASK BIT(27)
#define V9_READ_STLB_TPN_VALID_MASK BIT(28)
#define V9_READ_STLB_TPN_MASK (V9_READ_STLB_TPN_TPN_MASK | \
V9_READ_STLB_TPN_S2VALID_MASK | \
V9_READ_STLB_TPN_STAGE1_ENABLED_MASK | \
V9_READ_STLB_TPN_VALID_MASK)
#define V9_READ_STLB_TAG_PPN_VALID_MASK_TYPEB BIT(28)
#define V9_READ_STLB_TAG_PPN_PPN_MASK GENMASK(23, 0)
#define V9_READ_STLB_TAG_PPN_MASK (V9_READ_STLB_TAG_PPN_PPN_MASK | \
V9_READ_STLB_TAG_PPN_VALID_MASK_TYPEB)
#define V9_READ_STLB_TAG_OTHERS_S2AP_MASK_TYPEA GENMASK(1, 0)
#define V9_READ_STLB_TAG_OTHERS_PS_MASK GENMASK(10, 8)
#define V9_READ_STLB_TAG_OTHERS_BPS_MASK BIT(12)
#define V9_READ_STLB_TAG_OTHERS_VID_MASK GENMASK(23, 20)
#define V9_READ_STLB_TAG_OTHERS_MASK (V9_READ_STLB_TAG_OTHERS_S2AP_MASK_TYPEA | \
V9_READ_STLB_TAG_OTHERS_PS_MASK | \
V9_READ_STLB_TAG_OTHERS_BPS_MASK | \
V9_READ_STLB_TAG_OTHERS_VID_MASK)
#define V9_READ_STLB_WAY_TYPEA(x) FIELD_PREP(V9_READ_STLB_WAY_MASK_TYPEA, (x))
#define V9_READ_STLB_SET_TYPEA(x) FIELD_PREP(V9_READ_STLB_SET_MASK_TYPEA, (x))
#define V9_READ_STLB_STLBID_TYPEA(x) FIELD_PREP(V9_READ_STLB_STLBID_MASK_TYPEA, (x))
#define V9_READ_STLB_SUBLINE_TYPEA(x) FIELD_PREP(V9_READ_STLB_SUBLINE_MASK_TYPEA, (x))
#define V9_READ_STLB_TYPEA(s_i, sub, s, w) (V9_READ_STLB_WAY_TYPEA(w) | \
V9_READ_STLB_SET_TYPEA(s) | \
V9_READ_STLB_STLBID_TYPEA(s_i) | \
V9_READ_STLB_SUBLINE_TYPEA(sub))
#define V9_READ_STLB_WAY_TYPEB(x) FIELD_PREP(V9_READ_STLB_WAY_MASK_TYPEB, (x))
#define V9_READ_STLB_SET_TYPEB(x) FIELD_PREP(V9_READ_STLB_SET_MASK_TYPEB, (x))
#define V9_READ_STLB_STLBID_TYPEB(x) FIELD_PREP(V9_READ_STLB_STLBID_MASK_TYPEB, (x))
#define V9_READ_STLB_TYPEB(s_i, s, w) (V9_READ_STLB_WAY_TYPEB(w) | \
V9_READ_STLB_SET_TYPEB(s) | \
V9_READ_STLB_STLBID_TYPEB(s_i))
#define V9_MAX_PTLB_NUM 0x100
#define V9_MAX_STLB_NUM 0x100
#define V9_CTRL0_DIS_CHK_S1L1PTW_MASK BIT(0)
#define V9_CTRL0_DIS_CHK_S1L2PTW_MASK BIT(1)
#define V9_CTRL0_DIS_CHK_USR_MARCHED_REQ_MASK BIT(3)
#define V9_CTRL0_FAULT_MODE_MASK BIT(4)
#define V9_CTRL0_ENF_FLT_MODE_S1_NONSEC_MASK BIT(5)
#define V9_CTRL0_DESTRUCTIVE_AP_CHK_MODE_MASK BIT(6)
#define V9_CTRL0_MASK (V9_CTRL0_DIS_CHK_S1L1PTW_MASK | \
V9_CTRL0_DESTRUCTIVE_AP_CHK_MODE_MASK | \
V9_CTRL0_DIS_CHK_USR_MARCHED_REQ_MASK | \
V9_CTRL0_DIS_CHK_S1L2PTW_MASK | \
V9_CTRL0_ENF_FLT_MODE_S1_NONSEC_MASK | \
V9_CTRL0_FAULT_MODE_MASK)
/*
* S2MPU V9-specific values (some new, some different from older versions).
* To avoid any confusion, all names are prefixed with V9.
*/
#define V9_L1ENTRY_ATTR_GRAN_MASK BIT(3)
#define V9_MPT_PROT_BITS 4
#define V9_MPT_ACCESS_SHIFT 2
/* V1,V2 variants. */
#define MPT_ACCESS_SHIFT 0
#define L1ENTRY_ATTR_GRAN_MASK GENMASK(5, 4)
#define MPT_PROT_BITS 2
#define REG_NS_CTRL0 0x0
#define REG_NS_CTRL1 0x4
#define REG_NS_CFG 0x10
#define REG_NS_INTERRUPT_ENABLE_PER_VID_SET 0x20
#define REG_NS_INTERRUPT_CLEAR 0x2c
#define REG_NS_VERSION 0x60
#define REG_NS_INFO 0x64
#define REG_NS_STATUS 0x68
#define REG_NS_NUM_CONTEXT 0x100
#define REG_NS_CONTEXT_CFG_VALID_VID 0x104
#define REG_NS_ALL_INVALIDATION 0x1000
#define REG_NS_RANGE_INVALIDATION 0x1020
#define REG_NS_RANGE_INVALIDATION_START_PPN 0x1024
#define REG_NS_RANGE_INVALIDATION_END_PPN 0x1028
#define REG_NS_FAULT_STATUS 0x2000
#define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20))
#define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20))
#define REG_NS_FAULT_INFO(vid) (0x2010 + ((vid) * 0x20))
#define REG_NS_READ_MPTC 0x3000
#define REG_NS_READ_MPTC_TAG_PPN 0x3004
#define REG_NS_READ_MPTC_TAG_OTHERS 0x3008
#define REG_NS_READ_MPTC_DATA 0x3010
#define REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb) (0x4000 + ((vid) * 0x200) + ((gb) * 0x8))
#define REG_NS_L1ENTRY_ATTR(vid, gb) (0x4004 + ((vid) * 0x200) + ((gb) * 0x8))
#define CTRL0_ENABLE BIT(0)
#define CTRL0_INTERRUPT_ENABLE BIT(1)
#define CTRL0_FAULT_RESP_TYPE_SLVERR BIT(2) /* for v1 */
#define CTRL0_FAULT_RESP_TYPE_DECERR BIT(2) /* for v2 */
#define CTRL0_MASK (CTRL0_ENABLE | \
CTRL0_INTERRUPT_ENABLE | \
CTRL0_FAULT_RESP_TYPE_SLVERR | \
CTRL0_FAULT_RESP_TYPE_DECERR)
#define CTRL1_DISABLE_CHK_S1L1PTW BIT(0)
#define CTRL1_DISABLE_CHK_S1L2PTW BIT(1)
#define CTRL1_ENABLE_PAGE_SIZE_AWARENESS BIT(2)
#define CTRL1_DISABLE_CHK_USER_MATCHED_REQ BIT(3)
#define CTRL1_MASK (CTRL1_DISABLE_CHK_S1L1PTW | \
CTRL1_DISABLE_CHK_S1L2PTW | \
CTRL1_ENABLE_PAGE_SIZE_AWARENESS | \
CTRL1_DISABLE_CHK_USER_MATCHED_REQ)
#define CFG_MPTW_CACHE_OVERRIDE BIT(0)
#define CFG_MPTW_CACHE_VALUE GENMASK(7, 4)
#define CFG_MPTW_QOS_OVERRIDE BIT(8)
#define CFG_MPTW_QOS_VALUE GENMASK(15, 12)
#define CFG_MPTW_SHAREABLE BIT(16)
#define CFG_MASK (CFG_MPTW_CACHE_OVERRIDE | \
CFG_MPTW_CACHE_VALUE | \
CFG_MPTW_QOS_OVERRIDE | \
CFG_MPTW_QOS_VALUE | \
CFG_MPTW_SHAREABLE)
/* For use with hi_lo_readq_relaxed(). */
#define REG_NS_FAULT_PA_HIGH_LOW(vid) REG_NS_FAULT_PA_LOW(vid)
/* Mask used for extracting VID from FAULT_* register offset. */
#define REG_NS_FAULT_VID_MASK GENMASK(7, 5)
#define VERSION_MAJOR_ARCH_VER_MASK GENMASK(31, 28)
#define VERSION_MINOR_ARCH_VER_MASK GENMASK(27, 24)
#define VERSION_REV_ARCH_VER_MASK GENMASK(23, 16)
#define VERSION_RTL_VER_MASK GENMASK(7, 0)
/* Ignore RTL version in driver version check. */
#define VERSION_CHECK_MASK (VERSION_MAJOR_ARCH_VER_MASK | \
VERSION_MINOR_ARCH_VER_MASK | \
VERSION_REV_ARCH_VER_MASK)
#define INFO_NUM_SET_MASK GENMASK(15, 0)
#define STATUS_BUSY BIT(0)
#define STATUS_ON_INVALIDATING BIT(1)
#define NUM_CONTEXT_MASK GENMASK(3, 0)
#define CONTEXT_CFG_VALID_VID_CTX_VALID(ctx) BIT((4 * (ctx)) + 3)
#define CONTEXT_CFG_VALID_VID_CTX_VID(ctx, vid) \
FIELD_PREP(GENMASK((4 * (ctx) + 2), 4 * (ctx)), (vid))
#define INVALIDATION_INVALIDATE BIT(0)
#define RANGE_INVALIDATION_PPN_SHIFT 12
#define NR_FAULT_INFO_REGS 8
#define FAULT_INFO_VID_MASK GENMASK(26, 24)
#define FAULT_INFO_TYPE_MASK GENMASK(23, 21)
#define FAULT_INFO_TYPE_CONTEXT 0x4 /* v2 only */
#define FAULT_INFO_TYPE_AP 0x2
#define FAULT_INFO_TYPE_MPTW 0x1
#define FAULT_INFO_RW_BIT BIT(20)
#define FAULT_INFO_LEN_MASK GENMASK(19, 16)
#define FAULT_INFO_ID_MASK GENMASK(15, 0)
#define L1ENTRY_L2TABLE_ADDR_SHIFT 4
#define L1ENTRY_L2TABLE_ADDR(pa) ((pa) >> L1ENTRY_L2TABLE_ADDR_SHIFT)
#define READ_MPTC_WAY_MASK GENMASK(18, 16)
#define READ_MPTC_SET_MASK GENMASK(15, 0)
#define READ_MPTC_MASK (READ_MPTC_WAY_MASK | READ_MPTC_SET_MASK)
#define READ_MPTC_WAY(way) FIELD_PREP(READ_MPTC_WAY_MASK, (way))
#define READ_MPTC_SET(set) FIELD_PREP(READ_MPTC_SET_MASK, (set))
#define READ_MPTC(set, way) (READ_MPTC_SET(set) | READ_MPTC_WAY(way))
#define READ_MPTC_TAG_PPN_MASK GENMASK(23, 0)
#define READ_MPTC_TAG_OTHERS_VID_MASK GENMASK(10, 8)
#define READ_MPTC_TAG_OTHERS_GRAN_MASK GENMASK(5, 4)
#define READ_MPTC_TAG_OTHERS_VALID_BIT BIT(0)
#define READ_MPTC_TAG_OTHERS_MASK (READ_MPTC_TAG_OTHERS_VID_MASK | \
READ_MPTC_TAG_OTHERS_GRAN_MASK | \
READ_MPTC_TAG_OTHERS_VALID_BIT)
#define L1ENTRY_ATTR_L2TABLE_EN BIT(0)
#define L1ENTRY_ATTR_GRAN_4K 0x0
#define L1ENTRY_ATTR_GRAN_64K 0x1
#define L1ENTRY_ATTR_GRAN_2M 0x2
#define L1ENTRY_ATTR_GRAN(gran, msk) FIELD_PREP(msk, gran)
#define L1ENTRY_ATTR_PROT_MASK GENMASK(2, 1)
#define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(L1ENTRY_ATTR_PROT_MASK, prot)
#define L1ENTRY_ATTR_1G(prot) L1ENTRY_ATTR_PROT(prot)
#define L1ENTRY_ATTR_L2(gran, msk) (L1ENTRY_ATTR_GRAN(gran, msk) | \
L1ENTRY_ATTR_L2TABLE_EN)
#define NR_GIGABYTES 64
#define RO_GIGABYTES_FIRST 4
#define RO_GIGABYTES_LAST 33
#define NR_RO_GIGABYTES (RO_GIGABYTES_LAST - RO_GIGABYTES_FIRST + 1)
#define NR_RW_GIGABYTES (NR_GIGABYTES - NR_RO_GIGABYTES)
#ifdef CONFIG_ARM64_64K_PAGES
#define SMPT_GRAN SZ_64K
#define SMPT_GRAN_ATTR L1ENTRY_ATTR_GRAN_64K
#else
#define SMPT_GRAN SZ_4K
#define SMPT_GRAN_ATTR L1ENTRY_ATTR_GRAN_4K
#endif
static_assert(SMPT_GRAN <= PAGE_SIZE);
#define SMPT_WORD_SIZE sizeof(u32)
#define SMPT_ELEMS_PER_BYTE(prot_bits) (BITS_PER_BYTE / (prot_bits))
#define SMPT_ELEMS_PER_WORD(prot_bits) (SMPT_WORD_SIZE * SMPT_ELEMS_PER_BYTE(prot_bits))
#define SMPT_WORD_BYTE_RANGE(prot_bits) (SMPT_GRAN * SMPT_ELEMS_PER_WORD(prot_bits))
#define SMPT_NUM_ELEMS (SZ_1G / SMPT_GRAN)
#define SMPT_SIZE(prot_bits) (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE(prot_bits))
#define SMPT_NUM_WORDS(prot_bits) (SMPT_SIZE(prot_bits) / SMPT_WORD_SIZE)
#define SMPT_NUM_PAGES(prot_bits) (SMPT_SIZE(prot_bits) / PAGE_SIZE)
#define SMPT_ORDER(prot_bits) get_order(SMPT_SIZE(prot_bits))
#define SMPT_GRAN_MASK GENMASK(1, 0)
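As a worked example with a 4 KiB granule: SMPT_NUM_ELEMS = 1 GiB / 4 KiB = 262144 entries per gigabyte. With the 2-bit v1/v2 encoding that is 4 entries per byte, so SMPT_SIZE comes to 64 KiB (16 pages) per gigabyte; with the 4-bit v9 encoding it doubles to 128 KiB (32 pages).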
/* SysMMU_SYNC registers, relative to SYSMMU_SYNC_S2_OFFSET. */
#define REG_NS_SYNC_CMD 0x0
#define REG_NS_SYNC_COMP 0x4
#define SYNC_CMD_SYNC BIT(0)
#define SYNC_COMP_COMPLETE BIT(0)
/*
* Iterate over S2MPU gigabyte regions. Skip those that cannot be modified
* (the MMIO registers are read-only, with reset value MPT_PROT_NONE).
*/
#define for_each_gb_in_range(i, first, last) \
for ((i) = (first); (i) <= (last) && (i) < NR_GIGABYTES; \
(i) = (((i) + 1 == RO_GIGABYTES_FIRST) ? RO_GIGABYTES_LAST : (i)) + 1)
#define for_each_gb(i) for_each_gb_in_range(i, 0, NR_GIGABYTES - 1)
#define for_each_vid(i) for ((i) = 0; (i) < NR_VIDS; (i)++)
#define for_each_gb_and_vid(gb, vid) for_each_vid((vid)) for_each_gb((gb))
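Concretely, with RO_GIGABYTES_FIRST = 4 and RO_GIGABYTES_LAST = 33, for_each_gb(i) visits i = 0, 1, 2, 3 and then jumps straight to 34..63, skipping the read-only gigabyte regions.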
enum s2mpu_version {
S2MPU_VERSION_1 = 0x11000000,
S2MPU_VERSION_2 = 0x20000000,
S2MPU_VERSION_9 = 0x90000000,
};
static inline int smpt_order_from_version(enum s2mpu_version version)
{
if (version == S2MPU_VERSION_9)
return SMPT_ORDER(V9_MPT_PROT_BITS);
else if ((version == S2MPU_VERSION_1) || (version == S2MPU_VERSION_2))
return SMPT_ORDER(MPT_PROT_BITS);
BUG();
}
enum mpt_prot {
MPT_PROT_NONE = 0,
MPT_PROT_R = BIT(0),
MPT_PROT_W = BIT(1),
MPT_PROT_RW = MPT_PROT_R | MPT_PROT_W,
MPT_PROT_MASK = MPT_PROT_RW,
};
enum mpt_update_flags {
MPT_UPDATE_L1 = BIT(0),
MPT_UPDATE_L2 = BIT(1),
};
struct fmpt {
u32 *smpt;
bool gran_1g;
enum mpt_prot prot;
enum mpt_update_flags flags;
};
struct mpt {
struct fmpt fmpt[NR_GIGABYTES];
enum s2mpu_version version;
};
#endif /* __ARM64_KVM_S2MPU_H__ */


@ -4,7 +4,7 @@
#include <asm/atomic_ll_sc.h>
#ifdef CONFIG_ARM64_LSE_ATOMICS
#if defined(CONFIG_ARM64_LSE_ATOMICS) && !defined(BUILD_FIPS140_KO)
#define __LSE_PREAMBLE ".arch_extension lse\n"


@ -0,0 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __ASM_MEM_ENCRYPT_H
#define __ASM_MEM_ENCRYPT_H
bool mem_encrypt_active(void);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
#endif /* __ASM_MEM_ENCRYPT_H */


@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2022 Google LLC
* Author: Keir Fraser <keirf@google.com>
*/
#ifndef __ASM_MEM_RELINQUISH_H
#define __ASM_MEM_RELINQUISH_H
struct page;
bool kvm_has_memrelinquish_services(void);
void page_relinquish(struct page *page);
#endif /* __ASM_MEM_RELINQUISH_H */


@ -147,6 +147,7 @@
* Memory types for Stage-2 translation
*/
#define MT_S2_NORMAL 0xf
#define MT_S2_NORMAL_NC 0x5
#define MT_S2_DEVICE_nGnRE 0x1
/*
@ -154,6 +155,7 @@
* Stage-2 enforces Normal-WB and Device-nGnRE
*/
#define MT_S2_FWB_NORMAL 6
#define MT_S2_FWB_NORMAL_NC 5
#define MT_S2_FWB_DEVICE_nGnRE 1
#ifdef CONFIG_ARM64_4K_PAGES


@ -14,12 +14,50 @@ struct mod_plt_sec {
int plt_max_entries;
};
struct mod_arch_specific {
struct mod_plt_sec core;
struct mod_plt_sec init;
/* for CONFIG_DYNAMIC_FTRACE */
#define ARM64_MODULE_PLTS_ARCHDATA \
struct mod_plt_sec core; \
struct mod_plt_sec init; \
\
/* for CONFIG_DYNAMIC_FTRACE */ \
struct plt_entry *ftrace_trampolines;
#else
#define ARM64_MODULE_PLTS_ARCHDATA
#endif
#ifdef CONFIG_KVM
struct pkvm_module_section {
void *start;
void *end;
};
typedef s32 kvm_nvhe_reloc_t;
struct pkvm_module_ops;
struct pkvm_el2_module {
struct pkvm_module_section text;
struct pkvm_module_section bss;
struct pkvm_module_section rodata;
struct pkvm_module_section data;
kvm_nvhe_reloc_t *relocs;
unsigned int nr_relocs;
int (*init)(const struct pkvm_module_ops *ops);
};
void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va,
kvm_nvhe_reloc_t *begin,
kvm_nvhe_reloc_t *end);
#define ARM64_MODULE_KVM_ARCHDATA \
/* For pKVM hypervisor modules */ \
struct pkvm_el2_module hyp;
#else
#define ARM64_MODULE_KVM_ARCHDATA
#endif
#ifdef CONFIG_HAVE_MOD_ARCH_SPECIFIC
struct mod_arch_specific {
ARM64_MODULE_PLTS_ARCHDATA
ARM64_MODULE_KVM_ARCHDATA
};
#endif


@ -1,3 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <asm/page-def.h>
SECTIONS {
#ifdef CONFIG_ARM64_MODULE_PLTS
.plt 0 : { BYTE(0) }
@ -17,4 +20,24 @@ SECTIONS {
*/
.text.hot : { *(.text.hot) }
#endif
#ifdef CONFIG_KVM
.hyp.text : ALIGN(PAGE_SIZE) {
*(.hyp.text)
. = ALIGN(PAGE_SIZE);
}
.hyp.bss : ALIGN(PAGE_SIZE) {
*(.hyp.bss)
. = ALIGN(PAGE_SIZE);
}
.hyp.rodata : ALIGN(PAGE_SIZE) {
*(.hyp.rodata)
. = ALIGN(PAGE_SIZE);
}
.hyp.data : ALIGN(PAGE_SIZE) {
*(.hyp.data)
. = ALIGN(PAGE_SIZE);
}
.hyp.reloc : ALIGN(4) { *(.hyp.reloc) }
#endif
}


@ -6,6 +6,7 @@
int aarch64_insn_read(void *addr, u32 *insnp);
int aarch64_insn_write(void *addr, u32 insn);
int aarch64_addr_write(void *addr, u64 dst);
int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);


@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[];
extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
extern char __hyp_text_start[], __hyp_text_end[];
extern char __hyp_data_start[], __hyp_data_end[];
extern char __hyp_rodata_start[], __hyp_rodata_end[];
extern char __hyp_reloc_begin[], __hyp_reloc_end[];
extern char __hyp_bss_start[], __hyp_bss_end[];


@ -10,6 +10,7 @@ bool can_set_direct_map(void);
int set_memory_valid(unsigned long addr, int numpages, int enable);
int arch_set_direct_map_range_uncached(unsigned long addr, unsigned long numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
bool kernel_page_present(struct page *page);


@ -35,9 +35,7 @@ static __must_check inline bool may_use_simd(void)
* migrated, and if it's clear we cannot be migrated to a CPU
* where it is set.
*/
return !WARN_ON(!system_capabilities_finalized()) &&
system_supports_fpsimd() &&
!in_hardirq() && !irqs_disabled() && !in_nmi() &&
return !in_hardirq() && !irqs_disabled() && !in_nmi() &&
!this_cpu_read(fpsimd_context_busy);
}


@ -81,6 +81,12 @@ void __hyp_reset_vectors(void);
DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
static inline bool is_pkvm_initialized(void)
{
return IS_ENABLED(CONFIG_KVM) &&
static_branch_likely(&kvm_protected_mode_initialized);
}
/* Reports the availability of HYP mode */
static inline bool is_hyp_mode_available(void)
{
@ -88,8 +94,7 @@ static inline bool is_hyp_mode_available(void)
* If KVM protected mode is initialized, all CPUs must have been booted
* in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1.
*/
if (IS_ENABLED(CONFIG_KVM) &&
static_branch_likely(&kvm_protected_mode_initialized))
if (is_pkvm_initialized())
return true;
return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
@ -103,8 +108,7 @@ static inline bool is_hyp_mode_mismatched(void)
* If KVM protected mode is initialized, all CPUs must have been booted
* in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1.
*/
if (IS_ENABLED(CONFIG_KVM) &&
static_branch_likely(&kvm_protected_mode_initialized))
if (is_pkvm_initialized())
return false;
return __boot_cpu_mode[0] != __boot_cpu_mode[1];


@ -457,6 +457,15 @@ enum {
#define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS
#define KVM_PSCI_RET_DENIED PSCI_RET_DENIED
/* Protected KVM */
#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA 0
#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO 1
struct kvm_protected_vm_info {
__u64 firmware_size;
__u64 __reserved[7];
};
/* arm64-specific kvm_run::system_event flags */
/*
* Reset caused by a PSCI v1.1 SYSTEM_RESET2 call.


@ -287,8 +287,11 @@ static __init void parse_cmdline(void)
{
const u8 *prop = get_bootargs_cmdline();
if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop)
if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
!prop) {
__parse_cmdline(CONFIG_CMDLINE, true);
}
if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop)
__parse_cmdline(prop, true);


@ -71,12 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
/* Vectors installed by hyp-init on reset HVC. */
KVM_NVHE_ALIAS(__hyp_stub_vectors);
/* Kernel symbol used by icache_is_vpipt(). */
KVM_NVHE_ALIAS(__icache_flags);
/* VMID bits set by the KVM VMID allocator */
KVM_NVHE_ALIAS(kvm_arm_vmid_bits);
/* Static keys which are set if a vGIC trap should be handled in hyp. */
KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
@ -92,9 +86,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities);
KVM_NVHE_ALIAS(__start___kvm_ex_table);
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
/* Array containing bases of nVHE per-CPU memory regions. */
KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
/* PMU available static key */
#ifdef CONFIG_HW_PERF_EVENTS
KVM_NVHE_ALIAS(kvm_arm_pmu_available);
@ -111,12 +102,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
#endif
/* Kernel memory sections */
KVM_NVHE_ALIAS(__start_rodata);
KVM_NVHE_ALIAS(__end_rodata);
KVM_NVHE_ALIAS(__bss_start);
KVM_NVHE_ALIAS(__bss_stop);
/* Hyp memory sections */
KVM_NVHE_ALIAS(__hyp_idmap_text_start);
KVM_NVHE_ALIAS(__hyp_idmap_text_end);
@ -124,6 +109,8 @@ KVM_NVHE_ALIAS(__hyp_text_start);
KVM_NVHE_ALIAS(__hyp_text_end);
KVM_NVHE_ALIAS(__hyp_bss_start);
KVM_NVHE_ALIAS(__hyp_bss_end);
KVM_NVHE_ALIAS(__hyp_data_start);
KVM_NVHE_ALIAS(__hyp_data_end);
KVM_NVHE_ALIAS(__hyp_rodata_start);
KVM_NVHE_ALIAS(__hyp_rodata_end);


@ -505,14 +505,76 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
return 0;
}
static int module_init_hyp(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
struct module *mod)
{
#ifdef CONFIG_KVM
const Elf_Shdr *s;
/*
* If the .hyp.text is missing or empty, this is not a hypervisor
* module so ignore the rest of it.
*/
s = find_section(hdr, sechdrs, ".hyp.text");
if (!s || !s->sh_size)
return 0;
mod->arch.hyp.text = (struct pkvm_module_section) {
.start = (void *)s->sh_addr,
.end = (void *)s->sh_addr + s->sh_size,
};
s = find_section(hdr, sechdrs, ".hyp.bss");
if (!s)
return -ENOEXEC;
mod->arch.hyp.bss = (struct pkvm_module_section) {
.start = (void *)s->sh_addr,
.end = (void *)s->sh_addr + s->sh_size,
};
s = find_section(hdr, sechdrs, ".hyp.rodata");
if (!s)
return -ENOEXEC;
mod->arch.hyp.rodata = (struct pkvm_module_section) {
.start = (void *)s->sh_addr,
.end = (void *)s->sh_addr + s->sh_size,
};
s = find_section(hdr, sechdrs, ".hyp.data");
if (!s)
return -ENOEXEC;
mod->arch.hyp.data = (struct pkvm_module_section) {
.start = (void *)s->sh_addr,
.end = (void *)s->sh_addr + s->sh_size,
};
s = find_section(hdr, sechdrs, ".hyp.reloc");
if (!s)
return -ENOEXEC;
mod->arch.hyp.relocs = (void *)s->sh_addr;
mod->arch.hyp.nr_relocs = s->sh_size / sizeof(*mod->arch.hyp.relocs);
#endif
return 0;
}
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
int err;
const Elf_Shdr *s;
s = find_section(hdr, sechdrs, ".altinstructions");
if (s)
apply_alternatives_module((void *)s->sh_addr, s->sh_size);
return module_init_ftrace_plt(hdr, sechdrs, me);
err = module_init_ftrace_plt(hdr, sechdrs, me);
if (err)
return err;
return module_init_hyp(hdr, sechdrs, me);
}


@ -66,16 +66,16 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
return ret;
}
static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
static int __kprobes __aarch64_text_write(void *dst, void *src, size_t size)
{
void *waddr = addr;
unsigned long flags = 0;
unsigned long flags;
void *waddr;
int ret;
raw_spin_lock_irqsave(&patch_lock, flags);
waddr = patch_map(addr, FIX_TEXT_POKE0);
waddr = patch_map(dst, FIX_TEXT_POKE0);
ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE);
ret = copy_to_kernel_nofault(waddr, src, size);
patch_unmap(FIX_TEXT_POKE0);
raw_spin_unlock_irqrestore(&patch_lock, flags);
@ -85,7 +85,14 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
int __kprobes aarch64_insn_write(void *addr, u32 insn)
{
return __aarch64_insn_write(addr, cpu_to_le32(insn));
__le32 __insn = cpu_to_le32(insn);
return __aarch64_text_write(addr, &__insn, AARCH64_INSN_SIZE);
}
int __kprobes aarch64_addr_write(void *addr, u64 dst)
{
return __aarch64_text_write(addr, &dst, sizeof(dst));
}
int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)


@ -40,6 +40,7 @@
#include <asm/elf.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/hypervisor.h>
#include <asm/kasan.h>
#include <asm/numa.h>
#include <asm/sections.h>
@ -49,6 +50,7 @@
#include <asm/tlbflush.h>
#include <asm/traps.h>
#include <asm/efi.h>
#include <asm/hypervisor.h>
#include <asm/xen/hypervisor.h>
#include <asm/mmu_context.h>
@ -438,3 +440,10 @@ static int __init register_arm64_panic_block(void)
return 0;
}
device_initcall(register_arm64_panic_block);
void kvm_arm_init_hyp_services(void)
{
kvm_init_ioremap_services();
kvm_init_memshare_services();
kvm_init_memrelinquish_services();
}


@ -13,7 +13,7 @@
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
#define HYPERVISOR_DATA_SECTIONS \
#define HYPERVISOR_RODATA_SECTIONS \
HYP_SECTION_NAME(.rodata) : { \
. = ALIGN(PAGE_SIZE); \
__hyp_rodata_start = .; \
@ -23,6 +23,15 @@
__hyp_rodata_end = .; \
}
#define HYPERVISOR_DATA_SECTION \
HYP_SECTION_NAME(.data) : { \
. = ALIGN(PAGE_SIZE); \
__hyp_data_start = .; \
*(HYP_SECTION_NAME(.data)) \
. = ALIGN(PAGE_SIZE); \
__hyp_data_end = .; \
}
#define HYPERVISOR_PERCPU_SECTION \
. = ALIGN(PAGE_SIZE); \
HYP_SECTION_NAME(.data..percpu) : { \
@ -51,7 +60,8 @@
#define SBSS_ALIGN PAGE_SIZE
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_DATA_SECTIONS
#define HYPERVISOR_RODATA_SECTIONS
#define HYPERVISOR_DATA_SECTION
#define HYPERVISOR_PERCPU_SECTION
#define HYPERVISOR_RELOC_SECTION
#define SBSS_ALIGN 0
@ -188,7 +198,7 @@ SECTIONS
/* everything from this point to __init_begin will be marked RO NX */
RO_DATA(PAGE_SIZE)
HYPERVISOR_DATA_SECTIONS
HYPERVISOR_RODATA_SECTIONS
/* code sections that are never executed via the kernel mapping */
.rodata.text : {
@ -276,6 +286,8 @@ SECTIONS
_sdata = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
HYPERVISOR_DATA_SECTION
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback


@ -69,4 +69,13 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
config KVM_S2MPU
bool "Stage-2 Memory Protection Unit support"
depends on KVM
help
Support for the Stage-2 Memory Protection Unit (S2MPU) and Stream
Security Mapping Table (SSMT) devices in KVM. This allows the
hypervisor to restrict DMA access to its memory and the memory of
protected guests.
endif # VIRTUALIZATION


@ -8,13 +8,13 @@ ccflags-y += -I $(srctree)/$(src)
include $(srctree)/virt/kvm/Makefile.kvm
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM) += hyp/
obj-$(CONFIG_KVM) += hyp/ iommu/
kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
arch_timer.o trng.o vmid.o \
arch_timer.o trng.o vmid.o iommu.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \


@ -88,7 +88,9 @@ static u64 timer_get_offset(struct arch_timer_context *ctxt)
switch(arch_timer_ctx_index(ctxt)) {
case TIMER_VTIMER:
return __vcpu_sys_reg(vcpu, CNTVOFF_EL2);
if (likely(!kvm_vm_is_protected(vcpu->kvm)))
return __vcpu_sys_reg(vcpu, CNTVOFF_EL2);
fallthrough;
default:
return 0;
}
@ -768,6 +770,9 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
struct kvm *kvm = vcpu->kvm;
struct kvm_vcpu *tmp;
if (unlikely(kvm_vm_is_protected(vcpu->kvm)))
cntvoff = 0;
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, tmp, kvm)
timer_set_offset(vcpu_vtimer(tmp), cntvoff);


@ -37,6 +37,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_emulate.h>
#include <asm/sections.h>
@ -50,8 +51,8 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
DECLARE_KVM_NVHE_PER_CPU(int, hyp_cpu_number);
static bool vgic_present;
@ -78,18 +79,31 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
{
int r;
if (cap->flags)
return -EINVAL;
/* Capabilities with flags */
switch (cap->cap) {
case KVM_CAP_ARM_PROTECTED_VM:
return pkvm_vm_ioctl_enable_cap(kvm, cap);
default:
if (cap->flags)
return -EINVAL;
}
/* Capabilities without flags */
switch (cap->cap) {
case KVM_CAP_ARM_NISV_TO_USER:
r = 0;
set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&kvm->arch.flags);
if (kvm_vm_is_protected(kvm)) {
r = -EINVAL;
} else {
r = 0;
set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&kvm->arch.flags);
}
break;
case KVM_CAP_ARM_MTE:
mutex_lock(&kvm->lock);
if (!system_supports_mte() || kvm->created_vcpus) {
if (!system_supports_mte() ||
kvm_vm_is_protected(kvm) ||
kvm->created_vcpus) {
r = -EINVAL;
} else {
r = 0;
@ -138,24 +152,27 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
ret = kvm_arm_setup_stage2(kvm, type);
if (ret)
return ret;
ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
if (ret)
return ret;
if (type & ~KVM_VM_TYPE_MASK)
return -EINVAL;
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
goto out_free_stage2_pgd;
return ret;
ret = pkvm_init_host_vm(kvm, type);
if (ret)
goto err_unshare_kvm;
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
ret = -ENOMEM;
goto out_free_stage2_pgd;
goto err_unshare_kvm;
}
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
if (ret)
goto err_free_cpumask;
kvm_vgic_early_init(kvm);
/* The maximum number of VCPUs is limited by the host's GIC model */
@ -164,9 +181,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
return ret;
out_free_stage2_pgd:
kvm_free_stage2_pgd(&kvm->arch.mmu);
return 0;
err_free_cpumask:
free_cpumask_var(kvm->arch.supported_cpus);
err_unshare_kvm:
kvm_unshare_hyp(kvm, kvm + 1);
return ret;
}
@ -187,14 +207,22 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_vgic_destroy(kvm);
if (is_protected_kvm_enabled())
pkvm_destroy_hyp_vm(kvm);
kvm_destroy_vcpus(kvm);
if (atomic64_read(&kvm->stat.protected_hyp_mem))
pr_warn("%lluB of donations to the nVHE hyp are missing\n",
atomic64_read(&kvm->stat.protected_hyp_mem));
kvm_unshare_hyp(kvm, kvm + 1);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
static int kvm_check_extension(struct kvm *kvm, long ext)
{
int r;
switch (ext) {
case KVM_CAP_IRQCHIP:
r = vgic_present;
@ -212,7 +240,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
case KVM_CAP_ARM_NISV_TO_USER:
case KVM_CAP_ARM_INJECT_EXT_DABT:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_VCPU_ATTRIBUTES:
@ -220,6 +247,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SYSTEM_SUSPEND:
r = 1;
break;
case KVM_CAP_ARM_NISV_TO_USER:
r = !kvm || !kvm_vm_is_protected(kvm);
break;
case KVM_CAP_SET_GUEST_DEBUG2:
return KVM_GUESTDBG_VALID_MASK;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
@ -293,6 +323,75 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
/*
* Checks whether the extension specified in ext is supported in protected
* mode for the specified vm.
* The capabilities supported by kvm in general are passed in kvm_cap.
*/
static int pkvm_check_extension(struct kvm *kvm, long ext, int kvm_cap)
{
int r;
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_ARM_PSCI:
case KVM_CAP_ARM_PSCI_0_2:
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
case KVM_CAP_MSI_DEVID:
case KVM_CAP_ARM_VM_IPA_SIZE:
r = kvm_cap;
break;
case KVM_CAP_GUEST_DEBUG_HW_BPS:
r = min(kvm_cap, pkvm_get_max_brps());
break;
case KVM_CAP_GUEST_DEBUG_HW_WPS:
r = min(kvm_cap, pkvm_get_max_wrps());
break;
case KVM_CAP_ARM_PMU_V3:
r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
PVM_ID_AA64DFR0_ALLOW);
break;
case KVM_CAP_ARM_SVE:
r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE),
PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
break;
case KVM_CAP_ARM_PTRAUTH_ADDRESS:
r = kvm_cap &&
FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API),
PVM_ID_AA64ISAR1_ALLOW) &&
FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA),
PVM_ID_AA64ISAR1_ALLOW);
break;
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = kvm_cap &&
FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI),
PVM_ID_AA64ISAR1_ALLOW) &&
FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA),
PVM_ID_AA64ISAR1_ALLOW);
break;
case KVM_CAP_ARM_PROTECTED_VM:
r = 1;
break;
default:
r = 0;
break;
}
return r;
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = kvm_check_extension(kvm, ext);
if (kvm && kvm_vm_is_protected(kvm))
r = pkvm_check_extension(kvm, ext, r);
return r;
}
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -363,7 +462,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
if (is_protected_kvm_enabled())
free_hyp_stage2_memcache(&vcpu->arch.pkvm_memcache, vcpu->kvm);
else
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
@@ -385,6 +488,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct kvm_s2_mmu *mmu;
int *last_ran;
if (is_protected_kvm_enabled())
goto nommu;
mmu = vcpu->arch.hw_mmu;
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
@@ -402,6 +508,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*last_ran = vcpu->vcpu_id;
}
nommu:
vcpu->cpu = cpu;
kvm_vgic_load(vcpu);
@@ -422,18 +529,36 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu_ptrauth_disable(vcpu);
kvm_arch_vcpu_load_debug_state_flags(vcpu);
if (is_protected_kvm_enabled()) {
kvm_call_hyp_nvhe(__pkvm_vcpu_load,
vcpu->kvm->arch.pkvm.handle,
vcpu->vcpu_idx, vcpu->arch.hcr_el2);
kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
&vcpu->arch.vgic_cpu.vgic_v3);
}
if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus))
vcpu_set_on_unsupported_cpu(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
if (is_protected_kvm_enabled()) {
kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
&vcpu->arch.vgic_cpu.vgic_v3);
kvm_call_hyp_nvhe(__pkvm_vcpu_put);
/* __pkvm_vcpu_put implies a sync of the state */
if (!kvm_vm_is_protected(vcpu->kvm))
vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY);
}
kvm_arch_vcpu_put_debug_state_flags(vcpu);
kvm_arch_vcpu_put_fp(vcpu);
if (has_vhe())
kvm_vcpu_put_sysregs_vhe(vcpu);
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
kvm_vgic_put(vcpu, false);
kvm_vcpu_pmu_restore_host(vcpu);
kvm_arm_vmid_clear_active();
@@ -569,6 +694,15 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (ret)
return ret;
if (is_protected_kvm_enabled()) {
/* Start with the vcpu in a dirty state */
if (!kvm_vm_is_protected(vcpu->kvm))
vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY);
ret = pkvm_create_hyp_vm(kvm);
if (ret)
return ret;
}
if (!irqchip_in_kernel(kvm)) {
/*
* Tell the rest of the code that there are userspace irqchip
@@ -577,14 +711,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
static_branch_inc(&userspace_irqchip_in_use);
}
/*
* Initialize traps for protected VMs.
* NOTE: Move to run in EL2 directly, rather than via a hypercall, once
* the code is in place for first run initialization at EL2.
*/
if (kvm_vm_is_protected(kvm))
kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
mutex_lock(&kvm->lock);
set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
mutex_unlock(&kvm->lock);
@@ -660,15 +786,14 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
* doorbells to be signalled, should an interrupt become pending.
*/
preempt_disable();
kvm_vgic_vmcr_sync(vcpu);
vgic_v4_put(vcpu, true);
kvm_vgic_put(vcpu, true);
preempt_enable();
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
preempt_disable();
vgic_v4_load(vcpu);
kvm_vgic_load(vcpu);
preempt_enable();
}
@@ -1522,6 +1647,9 @@ static void cpu_prepare_hyp_mode(int cpu)
{
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
unsigned long tcr;
int *hyp_cpu_number_ptr = per_cpu_ptr_nvhe_sym(hyp_cpu_number, cpu);
*hyp_cpu_number_ptr = cpu;
/*
* Calculate the raw per-cpu offset without a translation from the
@@ -1779,6 +1907,7 @@ static bool init_psci_relay(void)
}
kvm_host_psci_config.version = psci_ops.get_version();
kvm_host_psci_config.smccc_version = arm_smccc_get_version();
if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
@@ -1844,13 +1973,13 @@ static void teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
}
}
static int do_pkvm_init(u32 hyp_va_bits)
{
void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
int ret;
preempt_disable();
@@ -1870,11 +1999,8 @@ static int do_pkvm_init(u32 hyp_va_bits)
return ret;
}
static int kvm_hyp_init_protection(u32 hyp_va_bits)
static void kvm_hyp_init_symbols(void)
{
void *addr = phys_to_virt(hyp_mem_base);
int ret;
kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
@@ -1883,6 +2009,15 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits)
kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
kvm_nvhe_sym(smccc_trng_available) = smccc_trng_available;
}
static int kvm_hyp_init_protection(u32 hyp_va_bits)
{
void *addr = phys_to_virt(hyp_mem_base);
int ret;
ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
if (ret)
@@ -1950,7 +2085,7 @@ static int init_hyp_mode(void)
page_addr = page_address(page);
memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
}
/*
@@ -1963,6 +2098,13 @@ static int init_hyp_mode(void)
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start),
kvm_ksym_ref(__hyp_data_end), PAGE_HYP);
if (err) {
kvm_err("Cannot map .hyp.data section\n");
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {
@@ -2043,7 +2185,7 @@ static int init_hyp_mode(void)
}
for_each_possible_cpu(cpu) {
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
char *percpu_end = percpu_begin + nvhe_percpu_size();
/* Map Hyp percpu pages */
@@ -2057,6 +2199,8 @@ static int init_hyp_mode(void)
cpu_prepare_hyp_mode(cpu);
}
kvm_hyp_init_symbols();
if (is_protected_kvm_enabled()) {
init_cpu_logical_map();
@@ -2064,9 +2208,7 @@ static int init_hyp_mode(void)
err = -ENODEV;
goto out_err;
}
}
if (is_protected_kvm_enabled()) {
err = kvm_hyp_init_protection(hyp_va_bits);
if (err) {
kvm_err("Failed to init hyp memory protection\n");
@@ -2099,6 +2241,17 @@ static int pkvm_drop_host_privileges(void)
* once the host stage 2 is installed.
*/
static_branch_enable(&kvm_protected_mode_initialized);
/*
* Fixup the boot mode so that we don't take spurious round
* trips via EL2 on cpu_resume. Flush to the PoC for a good
* measure, so that it can be observed by a CPU coming out of
* suspend with the MMU off.
*/
__boot_cpu_mode[0] = __boot_cpu_mode[1] = BOOT_CPU_MODE_EL1;
dcache_clean_poc((unsigned long)__boot_cpu_mode,
(unsigned long)(__boot_cpu_mode + 2));
on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
return ret;
}


@@ -39,9 +39,7 @@ static DEFINE_PER_CPU(u64, mdcr_el2);
*/
static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
{
u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
vcpu->arch.guest_debug_preserved.mdscr_el1 = val;
__vcpu_save_guest_debug_regs(vcpu);
trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
vcpu->arch.guest_debug_preserved.mdscr_el1);
@@ -52,9 +50,7 @@ static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
{
u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1;
vcpu_write_sys_reg(vcpu, val, MDSCR_EL1);
__vcpu_restore_guest_debug_regs(vcpu);
trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
vcpu_read_sys_reg(vcpu, MDSCR_EL1));
@@ -175,7 +171,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
kvm_arm_setup_mdcr_el2(vcpu);
/* Check if we need to use the debug registers. */
if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
if (kvm_vcpu_needs_debug_regs(vcpu)) {
/* Save guest debug state */
save_guest_debug_regs(vcpu);
@@ -284,7 +280,7 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
/*
* Restore the guest's debug registers if we were using them.
*/
if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
if (kvm_vcpu_needs_debug_regs(vcpu)) {
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
/*


@@ -29,7 +29,9 @@
#include "trace.h"
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
KVM_GENERIC_VM_STATS(),
STATS_DESC_ICOUNTER(VM, protected_hyp_mem),
STATS_DESC_ICOUNTER(VM, protected_shared_mem),
};
const struct kvm_stats_header kvm_vm_stats_header = {


@@ -240,6 +240,21 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu)
{
int handled;
/*
* If we run a non-protected VM when protection is enabled
* system-wide, resync the state from the hypervisor and mark
* it as dirty on the host side if it wasn't dirty already
* (which could happen if preemption has taken place).
*/
if (is_protected_kvm_enabled() && !kvm_vm_is_protected(vcpu->kvm)) {
preempt_disable();
if (!(vcpu_get_flag(vcpu, PKVM_HOST_STATE_DIRTY))) {
kvm_call_hyp_nvhe(__pkvm_vcpu_sync_state);
vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY);
}
preempt_enable();
}
/*
* See ARM ARM B1.14.1: "Hyp traps on instructions
* that fail their condition code check"
@@ -307,6 +322,13 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
/* For exit types that need handling before we can be preempted */
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
{
/*
* We just exited, so the state is clean from a hypervisor
* perspective.
*/
if (is_protected_kvm_enabled())
vcpu_clear_flag(vcpu, PKVM_HOST_STATE_DIRTY);
if (ARM_SERROR_PENDING(exception_index)) {
if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) {
u64 disr = kvm_vcpu_get_disr(vcpu);


@@ -61,12 +61,25 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
vcpu->arch.ctxt.spsr_und = val;
}
unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode,
enum exception_type type)
{
u64 mode = psr & (PSR_MODE_MASK | PSR_MODE32_BIT);
u64 exc_offset;
if (mode == target_mode)
exc_offset = CURRENT_EL_SP_ELx_VECTOR;
else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
exc_offset = CURRENT_EL_SP_EL0_VECTOR;
else if (!(mode & PSR_MODE32_BIT))
exc_offset = LOWER_EL_AArch64_VECTOR;
else
exc_offset = LOWER_EL_AArch32_VECTOR;
return exc_offset + type;
}
/*
* This performs the exception entry at a given EL (@target_mode), stashing PC
* and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE.
* The EL passed to this function *must* be a non-secure, privileged mode with
* bit 0 being set (PSTATE.SP == 1).
*
* When an exception is taken, most PSTATE fields are left unchanged in the
* handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
* of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
@@ -78,45 +91,17 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
* Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
* MSB to LSB.
*/
static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
enum exception_type type)
unsigned long get_except64_cpsr(unsigned long old, bool has_mte,
unsigned long sctlr, unsigned long target_mode)
{
unsigned long sctlr, vbar, old, new, mode;
u64 exc_offset;
mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
if (mode == target_mode)
exc_offset = CURRENT_EL_SP_ELx_VECTOR;
else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
exc_offset = CURRENT_EL_SP_EL0_VECTOR;
else if (!(mode & PSR_MODE32_BIT))
exc_offset = LOWER_EL_AArch64_VECTOR;
else
exc_offset = LOWER_EL_AArch32_VECTOR;
switch (target_mode) {
case PSR_MODE_EL1h:
vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1);
sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
__vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
break;
default:
/* Don't do that */
BUG();
}
*vcpu_pc(vcpu) = vbar + exc_offset + type;
old = *vcpu_cpsr(vcpu);
new = 0;
u64 new = 0;
new |= (old & PSR_N_BIT);
new |= (old & PSR_Z_BIT);
new |= (old & PSR_C_BIT);
new |= (old & PSR_V_BIT);
if (kvm_has_mte(kern_hyp_va(vcpu->kvm)))
if (has_mte)
new |= PSR_TCO_BIT;
new |= (old & PSR_DIT_BIT);
@@ -152,6 +137,36 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= target_mode;
return new;
}
/*
* This performs the exception entry at a given EL (@target_mode), stashing PC
* and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE.
* The EL passed to this function *must* be a non-secure, privileged mode with
* bit 0 being set (PSTATE.SP == 1).
*/
static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
enum exception_type type)
{
u64 offset = get_except64_offset(*vcpu_cpsr(vcpu), target_mode, type);
unsigned long sctlr, vbar, old, new;
switch (target_mode) {
case PSR_MODE_EL1h:
vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1);
sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
__vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
break;
default:
/* Don't do that */
BUG();
}
*vcpu_pc(vcpu) = vbar + offset;
old = *vcpu_cpsr(vcpu);
new = get_except64_cpsr(old, kvm_has_mte(kern_hyp_va(vcpu->kvm)), sctlr, target_mode);
*vcpu_cpsr(vcpu) = new;
__vcpu_write_spsr(vcpu, old);
}


@@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state)
sve_load 0, x1, x2, 3
ret
SYM_FUNC_END(__sve_restore_state)
SYM_FUNC_START(__sve_save_state)
mov x2, #1
sve_save 0, x1, x2, 3
ret
SYM_FUNC_END(__sve_save_state)


@@ -2,9 +2,12 @@
#include <linux/kbuild.h>
#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
int main(void)
{
DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page));
DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm));
DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu));
return 0;
}


@@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2022 - Google LLC
* Author: Andrew Walbran <qwandor@google.com>
*/
#ifndef __KVM_HYP_FFA_H
#define __KVM_HYP_FFA_H
#include <asm/kvm_host.h>
#define FFA_MIN_FUNC_NUM 0x60
#define FFA_MAX_FUNC_NUM 0x7F
int hyp_ffa_init(void *pages);
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);
#endif /* __KVM_HYP_FFA_H */


@@ -1,205 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2021 Google LLC
* Author: Fuad Tabba <tabba@google.com>
*/
#ifndef __ARM64_KVM_FIXED_CONFIG_H__
#define __ARM64_KVM_FIXED_CONFIG_H__
#include <asm/sysreg.h>
/*
* This file contains definitions for features to be allowed or restricted for
* guest virtual machines, depending on the mode KVM is running in and on the
* type of guest that is running.
*
* The ALLOW masks represent a bitmask of feature fields that are allowed
* without any restrictions as long as they are supported by the system.
*
* The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for
* features that are restricted to support at most the specified feature.
*
* If a feature field is not present in either, than it is not supported.
*
* The approach taken for protected VMs is to allow features that are:
* - Needed by common Linux distributions (e.g., floating point)
* - Trivial to support, e.g., supporting the feature does not introduce or
* require tracking of additional state in KVM
* - Cannot be trapped or prevent the guest from using anyway
*/
/*
* Allow for protected VMs:
* - Floating-point and Advanced SIMD
* - Data Independent Timing
*/
#define PVM_ID_AA64PFR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \
)
/*
* Restrict to the following *unsigned* features for protected VMs:
* - AArch64 guests only (no support for AArch32 guests):
* AArch32 adds complexity in trap handling, emulation, condition codes,
* etc...
* - RAS (v1)
* Supported by KVM
*/
#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \
)
/*
* Allow for protected VMs:
* - Branch Target Identification
* - Speculative Store Bypassing
*/
#define PVM_ID_AA64PFR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \
ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \
)
/*
* Allow for protected VMs:
* - Mixed-endian
* - Distinction between Secure and Non-secure Memory
* - Mixed-endian at EL0 only
* - Non-context synchronizing exception entry and exit
*/
#define PVM_ID_AA64MMFR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \
ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \
)
/*
* Restrict to the following *unsigned* features for protected VMs:
* - 40-bit IPA
* - 16-bit ASID
*/
#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \
FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \
)
/*
* Allow for protected VMs:
* - Hardware translation table updates to Access flag and Dirty state
* - Number of VMID bits from CPU
* - Hierarchical Permission Disables
* - Privileged Access Never
* - SError interrupt exceptions from speculative reads
* - Enhanced Translation Synchronization
*/
#define PVM_ID_AA64MMFR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \
ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \
)
/*
* Allow for protected VMs:
* - Common not Private translations
* - User Access Override
* - IESB bit in the SCTLR_ELx registers
* - Unaligned single-copy atomicity and atomic functions
* - ESR_ELx.EC value on an exception by read access to feature ID space
* - TTL field in address operations.
* - Break-before-make sequences when changing translation block size
* - E0PDx mechanism
*/
#define PVM_ID_AA64MMFR2_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \
ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \
)
/*
* No support for Scalable Vectors for protected VMs:
* Requires additional support from KVM, e.g., context-switching and
* trapping at EL2
*/
#define PVM_ID_AA64ZFR0_ALLOW (0ULL)
/*
* No support for debug, including breakpoints, and watchpoints for protected
* VMs:
* The Arm architecture mandates support for at least the Armv8 debug
* architecture, which would include at least 2 hardware breakpoints and
* watchpoints. Providing that support to protected guests adds
* considerable state and complexity. Therefore, the reserved value of 0 is
* used for debug-related fields.
*/
#define PVM_ID_AA64DFR0_ALLOW (0ULL)
#define PVM_ID_AA64DFR1_ALLOW (0ULL)
/*
* No support for implementation defined features.
*/
#define PVM_ID_AA64AFR0_ALLOW (0ULL)
#define PVM_ID_AA64AFR1_ALLOW (0ULL)
/*
* No restrictions on instructions implemented in AArch64.
*/
#define PVM_ID_AA64ISAR0_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \
)
#define PVM_ID_AA64ISAR1_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \
)
#define PVM_ID_AA64ISAR2_ALLOW (\
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \
)
u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
int kvm_check_pvm_sysreg_table(void);
#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */


@@ -7,7 +7,7 @@
#include <nvhe/memory.h>
#include <nvhe/spinlock.h>
#define HYP_NO_ORDER USHRT_MAX
#define HYP_NO_ORDER 0xff
struct hyp_pool {
/*
@@ -19,11 +19,11 @@ struct hyp_pool {
struct list_head free_area[MAX_ORDER];
phys_addr_t range_start;
phys_addr_t range_end;
unsigned short max_order;
u8 max_order;
};
/* Allocation */
void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
void *hyp_alloc_pages(struct hyp_pool *pool, u8 order);
void hyp_split_page(struct hyp_page *page);
void hyp_get_page(struct hyp_pool *pool, void *addr);
void hyp_put_page(struct hyp_pool *pool, void *addr);
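Shrinking the order to a u8 narrows HYP_NO_ORDER to match, but the allocator interface above is unchanged. A minimal usage sketch follows; the caller is hypothetical and it assumes the pool was already set up (e.g. via hyp_pool_init()):
/*
 * Hypothetical caller: allocate a single page (order 0) from an already
 * initialised pool and release it again. hyp_alloc_pages() returns NULL
 * on failure; hyp_put_page() frees the page once its refcount drops to 0.
 */
static int example_use_hyp_pool(struct hyp_pool *pool)
{
	void *page = hyp_alloc_pages(pool, 0);

	if (!page)
		return -ENOMEM;

	/* ... use the page ... */

	hyp_put_page(pool, page);
	return 0;
}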


@@ -0,0 +1,102 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __ARM64_KVM_NVHE_IOMMU_H__
#define __ARM64_KVM_NVHE_IOMMU_H__
#include <linux/types.h>
#include <asm/kvm_host.h>
#include <nvhe/mem_protect.h>
struct pkvm_iommu;
struct pkvm_iommu_ops {
/*
* Global driver initialization called before devices are registered.
* Driver-specific arguments are passed in a buffer shared by the host.
* The buffer memory has been pinned in EL2 but host retains R/W access.
* Extra care must be taken when reading from it to avoid TOCTOU bugs.
* If the driver maintains its own page tables, it is expected to
* initialize them to all memory owned by the host.
* Driver initialization lock held during callback.
*/
int (*init)(void *data, size_t size);
/*
* Driver-specific validation of a device that is being registered.
* All fields of the device struct have been populated.
* Called with the host lock held.
*/
int (*validate)(struct pkvm_iommu *dev);
/*
* Validation of a new child device that is being register by
* the parent device the child selected. Called with the host lock held.
*/
int (*validate_child)(struct pkvm_iommu *dev, struct pkvm_iommu *child);
/*
* Callback to apply a host stage-2 mapping change at driver level.
* Called before 'host_stage2_idmap_apply' with host lock held.
*/
void (*host_stage2_idmap_prepare)(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot);
/*
* Callback to apply a host stage-2 mapping change at device level.
* Called after 'host_stage2_idmap_prepare' with host lock held.
*/
void (*host_stage2_idmap_apply)(struct pkvm_iommu *dev,
phys_addr_t start, phys_addr_t end);
/*
* Callback to finish a host stage-2 mapping change at device level.
* Called after 'host_stage2_idmap_apply' with host lock held.
*/
void (*host_stage2_idmap_complete)(struct pkvm_iommu *dev);
/* Power management callbacks. Called with host lock held. */
int (*suspend)(struct pkvm_iommu *dev);
int (*resume)(struct pkvm_iommu *dev);
/*
* Host data abort handler callback. Called with host lock held.
* Returns true if the data abort has been handled.
*/
bool (*host_dabt_handler)(struct pkvm_iommu *dev,
struct kvm_cpu_context *host_ctxt,
u32 esr, size_t off);
/* Amount of memory allocated per-device for use by the driver. */
size_t data_size;
};
struct pkvm_iommu {
struct pkvm_iommu *parent;
struct list_head list;
struct list_head siblings;
struct list_head children;
unsigned long id;
const struct pkvm_iommu_ops *ops;
phys_addr_t pa;
void *va;
size_t size;
bool powered;
char data[];
};
int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size);
int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id,
phys_addr_t dev_pa, size_t dev_size,
unsigned long parent_id,
void *kern_mem_va, size_t mem_size);
int __pkvm_iommu_pm_notify(unsigned long dev_id,
enum pkvm_iommu_pm_event event);
int __pkvm_iommu_finalize(void);
int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start,
phys_addr_t *end);
bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr,
phys_addr_t fault_pa);
void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot);
#endif /* __ARM64_KVM_NVHE_IOMMU_H__ */
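The ops above define the whole driver contract. A hedged sketch of a minimal driver providing them is shown below; the example_* names and function bodies are made up, and only the pkvm_iommu_ops fields and callback signatures come from this header:
/*
 * Hypothetical pKVM IOMMU driver skeleton; bodies are placeholders.
 */
static int example_iommu_init(void *data, size_t size)
{
	/* Parse driver-specific arguments passed from the host. */
	return 0;
}

static int example_iommu_validate(struct pkvm_iommu *dev)
{
	/* Reject devices whose MMIO window looks implausible. */
	return dev->size ? 0 : -EINVAL;
}

static void example_iommu_idmap_apply(struct pkvm_iommu *dev,
				      phys_addr_t start, phys_addr_t end)
{
	/* Mirror the host stage-2 change into the device's page tables. */
}

static const struct pkvm_iommu_ops example_iommu_ops = {
	.init			 = example_iommu_init,
	.validate		 = example_iommu_validate,
	.host_stage2_idmap_apply = example_iommu_idmap_apply,
	.data_size		 = 0,
};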


@@ -8,8 +8,10 @@
#define __KVM_NVHE_MEM_PROTECT__
#include <linux/kvm_host.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/virt.h>
#include <nvhe/pkvm.h>
#include <nvhe/spinlock.h>
/*
@@ -29,6 +31,7 @@ enum pkvm_page_state {
/* Meta-states which aren't encoded directly in the PTE's SW bits */
PKVM_NOPAGE,
PKVM_PAGE_RESTRICTED_PROT,
};
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
@@ -43,30 +46,69 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
return prot & PKVM_PAGE_STATE_PROT_MASK;
}
struct host_kvm {
struct host_mmu {
struct kvm_arch arch;
struct kvm_pgtable pgt;
struct kvm_pgtable_mm_ops mm_ops;
hyp_spinlock_t lock;
};
extern struct host_kvm host_kvm;
extern struct host_mmu host_mmu;
extern const u8 pkvm_hyp_id;
/* This corresponds to page-table locking order */
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
PKVM_ID_GUEST,
PKVM_ID_FFA,
PKVM_ID_PROTECTED,
PKVM_ID_MAX = PKVM_ID_PROTECTED,
};
extern unsigned long hyp_nr_cpus;
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_reclaim_page(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu,
u64 ipa, u64 *ppa);
int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot,
bool update_iommu);
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id);
int host_stage2_protect_pages_locked(phys_addr_t addr, u64 size);
int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size);
int kvm_host_prepare_stage2(void *pgt_pool_base);
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr));
int hyp_pin_shared_mem(void *from, void *to);
void hyp_unpin_shared_mem(void *from, void *to);
void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
int hyp_protect_host_page(u64 pfn, enum kvm_pgtable_prot prot);
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc);
void psci_mem_protect_inc(u64 n);
void psci_mem_protect_dec(u64 n);
static __always_inline void __load_host_stage2(void)
{
if (static_branch_likely(&kvm_protected_mode_initialized))
__load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
else
write_sysreg(0, vttbr_el2);
}


@@ -7,9 +7,17 @@
#include <linux/types.h>
/*
* Accesses to struct hyp_page flags are serialized by the host stage-2
* page-table lock.
*/
#define HOST_PAGE_NEED_POISONING BIT(0)
#define HOST_PAGE_PENDING_RECLAIM BIT(1)
struct hyp_page {
unsigned short refcount;
unsigned short order;
u8 order;
u8 flags;
};
extern u64 __hyp_vmemmap;
@@ -38,6 +46,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
/*
* Refcounting for 'struct hyp_page'.
* hyp_pool::lock must be held if atomic access to the refcount is required.
*/
static inline int hyp_page_count(void *addr)
{
struct hyp_page *p = hyp_virt_to_page(addr);
@@ -45,4 +57,27 @@ static inline int hyp_page_count(void *addr)
return p->refcount;
}
static inline void hyp_page_ref_inc(struct hyp_page *p)
{
BUG_ON(p->refcount == USHRT_MAX);
p->refcount++;
}
static inline void hyp_page_ref_dec(struct hyp_page *p)
{
BUG_ON(!p->refcount);
p->refcount--;
}
static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
{
hyp_page_ref_dec(p);
return (p->refcount == 0);
}
static inline void hyp_set_page_refcounted(struct hyp_page *p)
{
BUG_ON(p->refcount);
p->refcount = 1;
}
#endif /* __KVM_HYP_MEMORY_H */
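As the comment above says, these helpers rely on the owning pool's lock for any concurrent access. A short hypothetical illustration: the caller below is made up, and it assumes struct hyp_pool exposes its hyp_spinlock_t lock as declared in nvhe/gfp.h.
/*
 * Hypothetical caller: pin a hyp page with an extra reference while
 * inspecting it, then drop the reference again, under the pool lock.
 */
static void example_hold_page(struct hyp_pool *pool, void *addr)
{
	struct hyp_page *p = hyp_virt_to_page(addr);

	hyp_spin_lock(&pool->lock);
	hyp_page_ref_inc(p);
	/* ... the page cannot reach refcount 0 while we hold this ref ... */
	if (hyp_page_ref_dec_and_test(p)) {
		/* Last reference dropped: a real caller would return the page. */
	}
	hyp_spin_unlock(&pool->lock);
}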


@@ -12,10 +12,16 @@
extern struct kvm_pgtable pkvm_pgtable;
extern hyp_spinlock_t pkvm_pgd_lock;
extern const struct pkvm_module_ops module_ops;
int hyp_create_pcpu_fixmap(void);
void *hyp_fixmap_map(phys_addr_t phys);
void hyp_fixmap_unmap(void);
void hyp_poison_page(phys_addr_t phys);
int hyp_create_idmap(u32 hyp_va_bits);
int hyp_map_vectors(void);
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
int hyp_back_vmemmap(phys_addr_t back);
int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
@@ -23,17 +29,9 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot,
unsigned long *haddr);
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
void pkvm_remove_mappings(void *from, void *to);
static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
unsigned long *start, unsigned long *end)
{
unsigned long nr_pages = size >> PAGE_SHIFT;
struct hyp_page *p = hyp_phys_to_page(phys);
*start = (unsigned long)p;
*end = *start + nr_pages * sizeof(struct hyp_page);
*start = ALIGN_DOWN(*start, PAGE_SIZE);
*end = ALIGN(*end, PAGE_SIZE);
}
int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot);
void __pkvm_unmap_module_page(u64 pfn, void *va);
void *__pkvm_alloc_module_va(u64 nr_pages);
#endif /* __KVM_HYP_MM_H */


@@ -0,0 +1,34 @@
#include <asm/kvm_pgtable.h>
#define HCALL_HANDLED 0
#define HCALL_UNHANDLED -1
int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *));
int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *));
int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *));
int __pkvm_register_hyp_panic_notifier(void (*cb)(struct kvm_cpu_context *));
enum pkvm_psci_notification;
int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *));
#ifdef CONFIG_MODULES
int __pkvm_init_module(void *module_init);
int __pkvm_register_hcall(unsigned long hfn_hyp_va);
int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt);
void pkvm_modules_lock(void);
void pkvm_modules_unlock(void);
bool pkvm_modules_enabled(void);
int __pkvm_close_module_registration(void);
#else
static inline int __pkvm_init_module(void *module_init) { return -EOPNOTSUPP; }
static inline int
__pkvm_register_hcall(unsigned long hfn_hyp_va) { return -EOPNOTSUPP; }
static inline int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt)
{
return HCALL_UNHANDLED;
}
static inline void pkvm_modules_lock(void) { }
static inline void pkvm_modules_unlock(void) { }
static inline bool pkvm_modules_enabled(void) { return false; }
static inline int __pkvm_close_module_registration(void) { return -EOPNOTSUPP; }
#endif
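A hedged sketch of how a pKVM module might use these hooks at load time: the handler ABI (a kvm_cpu_context argument with results returned in x0/x1) and the example_* names are assumptions; only __pkvm_register_hcall() and the HCALL_* return codes come from this header.
/*
 * Hypothetical module init: register one dynamic hypercall handler.
 * The handler signature mirrors the built-in host hcall handlers and is
 * an assumption, not taken from this header.
 */
static void example_module_hcall(struct kvm_cpu_context *host_ctxt)
{
	cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
	cpu_reg(host_ctxt, 1) = 0;
}

static int example_module_init(void)
{
	/* Fails with -EOPNOTSUPP once module registration has been closed. */
	return __pkvm_register_hcall((unsigned long)example_module_hcall);
}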


@@ -0,0 +1,140 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2021 Google LLC
* Author: Fuad Tabba <tabba@google.com>
*/
#ifndef __ARM64_KVM_NVHE_PKVM_H__
#define __ARM64_KVM_NVHE_PKVM_H__
#include <asm/kvm_pkvm.h>
#include <nvhe/gfp.h>
#include <nvhe/spinlock.h>
/*
* Holds the relevant data for maintaining the vcpu state completely at hyp.
*/
struct pkvm_hyp_vcpu {
struct kvm_vcpu vcpu;
/* Backpointer to the host's (untrusted) vCPU instance. */
struct kvm_vcpu *host_vcpu;
/*
* If this hyp vCPU is loaded, then this is a backpointer to the
* per-cpu pointer tracking us. Otherwise, NULL if not loaded.
*/
struct pkvm_hyp_vcpu **loaded_hyp_vcpu;
/* Tracks exit code for the protected guest. */
u32 exit_code;
/*
* Track the power state transition of a protected vcpu.
* Can be in one of three states:
* PSCI_0_2_AFFINITY_LEVEL_ON
* PSCI_0_2_AFFINITY_LEVEL_OFF
* PSCI_0_2_AFFINITY_LEVEL_PENDING
*/
int power_state;
};
/*
* Holds the relevant data for running a protected vm.
*/
struct pkvm_hyp_vm {
struct kvm kvm;
/* Backpointer to the host's (untrusted) KVM instance. */
struct kvm *host_kvm;
/* The guest's stage-2 page-table managed by the hypervisor. */
struct kvm_pgtable pgt;
struct kvm_pgtable_mm_ops mm_ops;
struct hyp_pool pool;
hyp_spinlock_t lock;
/* Primary vCPU pending entry to the pvmfw */
struct pkvm_hyp_vcpu *pvmfw_entry_vcpu;
/*
* The number of vcpus initialized and ready to run.
* Modifying this is protected by 'vm_table_lock'.
*/
unsigned int nr_vcpus;
/* Array of the hyp vCPU structures for this VM. */
struct pkvm_hyp_vcpu *vcpus[];
};
static inline struct pkvm_hyp_vm *
pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
{
return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
}
static inline bool vcpu_is_protected(struct kvm_vcpu *vcpu)
{
if (!is_protected_kvm_enabled())
return false;
return vcpu->kvm->arch.pkvm.enabled;
}
static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu)
{
return vcpu_is_protected(&hyp_vcpu->vcpu);
}
extern phys_addr_t pvmfw_base;
extern phys_addr_t pvmfw_size;
void pkvm_hyp_vm_table_init(void *tbl);
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva, unsigned long last_ran_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
unsigned long vcpu_hva);
int __pkvm_teardown_vm(pkvm_handle_t handle);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void);
u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu);
int kvm_check_pvm_sysreg_table(void);
void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr);
static inline bool pkvm_hyp_vm_has_pvmfw(struct pkvm_hyp_vm *vm)
{
return vm->kvm.arch.pkvm.pvmfw_load_addr != PVMFW_INVALID_LOAD_ADDR;
}
static inline bool pkvm_ipa_range_has_pvmfw(struct pkvm_hyp_vm *vm,
u64 ipa_start, u64 ipa_end)
{
struct kvm_protected_vm *pkvm = &vm->kvm.arch.pkvm;
u64 pvmfw_load_end = pkvm->pvmfw_load_addr + pvmfw_size;
if (!pkvm_hyp_vm_has_pvmfw(vm))
return false;
return ipa_end > pkvm->pvmfw_load_addr && ipa_start < pvmfw_load_end;
}
int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys,
u64 size);
void pkvm_poison_pvmfw_pages(void);
#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
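A small hypothetical helper built on the load/put pair declared above, showing the expected pin-then-release pattern (the helper name is made up):
/*
 * Hypothetical helper: briefly pin the hyp view of a vCPU to query it.
 * pkvm_load_hyp_vcpu() returns NULL if the handle/index pair is invalid
 * or the vCPU is already loaded elsewhere.
 */
static bool example_vcpu_is_protected(pkvm_handle_t handle, unsigned int idx)
{
	struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_load_hyp_vcpu(handle, idx);
	bool prot;

	if (!hyp_vcpu)
		return false;

	prot = pkvm_hyp_vcpu_is_protected(hyp_vcpu);
	pkvm_put_hyp_vcpu(hyp_vcpu);
	return prot;
}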


@@ -0,0 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __ARM64_KVM_NVHE_SERIAL_H__
#define __ARM64_KVM_NVHE_SERIAL_H__
void hyp_puts(const char *s);
void hyp_putx64(u64 x);
void hyp_putc(char c);
int __pkvm_register_serial_driver(void (*driver_cb)(char));
#endif
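A minimal, hypothetical example of plugging an early console into this interface; the UART address is a placeholder and the raw byte store stands in for a real driver's MMIO write:
/*
 * Hypothetical serial callback: write one character to a memory-mapped
 * UART TX register. The address is a placeholder; a real driver would
 * map its MMIO region first.
 */
static void example_hyp_putc(char c)
{
	volatile unsigned char *tx = (volatile unsigned char *)0xdead0000UL; /* placeholder */

	*tx = c;
}

static int example_serial_setup(void)
{
	int ret = __pkvm_register_serial_driver(example_hyp_putc);

	if (!ret)
		hyp_puts("pKVM early console registered");
	return ret;
}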


@@ -28,9 +28,17 @@ typedef union hyp_spinlock {
};
} hyp_spinlock_t;
#define __HYP_SPIN_LOCK_INITIALIZER \
{ .__val = 0 }
#define __HYP_SPIN_LOCK_UNLOCKED \
((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER)
#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED
#define hyp_spin_lock_init(l) \
do { \
*(l) = (hyp_spinlock_t){ .__val = 0 }; \
*(l) = __HYP_SPIN_LOCK_UNLOCKED; \
} while (0)
static inline void hyp_spin_lock(hyp_spinlock_t *lock)


@@ -15,6 +15,4 @@
#define DECLARE_REG(type, name, ctxt, reg) \
type name = (type)cpu_reg(ctxt, (reg))
void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu);
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */


@@ -1,4 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
gen-hyprel
hyp.lds
hyp-reloc.S


@@ -1,111 +1,23 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part
#
asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as
# there is no way to execute them and any such MMIO access from nVHE KVM
# will explode instantly (Words of Marc Zyngier). So introduce a generic flag
# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM.
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
lib-objs := clear_page.o copy_page.o memcpy.o memset.o
lib-objs := $(addprefix ../../../lib/, $(lib-objs))
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o iommu.o \
serial.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
hyp-obj-$(CONFIG_MODULES) += modules.o
hyp-obj-y += $(lib-objs)
##
## Build rules for compiling nVHE hyp code
## Output of this folder is `kvm_nvhe.o`, a partially linked object
## file containing all nVHE hyp code and data.
##
hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/s2mpu.o
hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/io-mpt-s2mpu.o
hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y))
obj-y := kvm_nvhe.o
targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o
# 1) Compile all source files to `.nvhe.o` object files. The file extension
# avoids file name clashes for files shared with VHE.
$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
# 2) Compile linker script.
$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
$(call if_changed_dep,cpp_lds_S)
# 3) Partially link all '.nvhe.o' files and apply the linker script.
# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
# keep the dependency on the target while avoiding an error from
# GNU ld if the linker script is passed to it twice.
LDFLAGS_kvm_nvhe.tmp.o := -r -T
$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
$(call if_changed,ld)
# 4) Generate list of hyp code/data positions that need to be relocated at
# runtime. Because the hypervisor is part of the kernel binary, relocations
# produce a kernel VA. We enumerate relocations targeting hyp at build time
# and convert the kernel VAs at those positions to hyp VAs.
$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE
$(call if_changed,hyprel)
# 5) Compile hyp-reloc.S and link it into the existing partially linked object.
# The object file now contains a section with pointers to hyp positions that
# will contain kernel VAs at runtime. These pointers have relocations on them
# so that they get updated as the hyp object is linked into `vmlinux`.
LDFLAGS_kvm_nvhe.rel.o := -r
$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE
$(call if_changed,ld)
# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
# Prefixes names of ELF symbols with '__kvm_nvhe_'.
$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
$(call if_changed,hypcopy)
# The HYPREL command calls `gen-hyprel` to generate an assembly file with
# a list of relocations targeting hyp code/data.
quiet_cmd_hyprel = HYPREL $@
cmd_hyprel = $(obj)/gen-hyprel $< > $@
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
# when profile optimization is applied. gen-hyprel does not support SHT_REL and
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
# KVM nVHE code is run at a different exception code with a different map, so
# compiler instrumentation that inserts callbacks or checks into the code may
# cause crashes. Just disable it.
GCOV_PROFILE := n
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
# Skip objtool checking for this directory because nVHE code is compiled with
# non-standard build rules.
OBJECT_FILES_NON_STANDARD := y
include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe
obj-y := kvm_nvhe.o


@@ -0,0 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
$(obj)/hyp.lds: arch/arm64/kvm/hyp/nvhe/module.lds.S FORCE
$(call if_changed_dep,cpp_lds_S)
include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe


@@ -0,0 +1,94 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part
#
asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as
# there is no way to execute them and any such MMIO access from nVHE KVM
# will explode instantly (Words of Marc Zyngier). So introduce a generic flag
# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM.
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
HYPREL := arch/arm64/tools/gen-hyprel
##
## Build rules for compiling nVHE hyp code
## Output of this folder is `kvm_nvhe.o`, a partially linked object
## file containing all nVHE hyp code and data.
##
hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y))
targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o
# 1) Compile all source files to `.nvhe.o` object files. The file extension
# avoids file name clashes for files shared with VHE.
$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
# 2) Partially link all '.nvhe.o' files and apply the linker script.
# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
# keep the dependency on the target while avoiding an error from
# GNU ld if the linker script is passed to it twice.
LDFLAGS_kvm_nvhe.tmp.o := -r -T
$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
$(call if_changed,ld)
# 3) Generate list of hyp code/data positions that need to be relocated at
# runtime. Because the hypervisor is part of the kernel binary, relocations
# produce a kernel VA. We enumerate relocations targeting hyp at build time
# and convert the kernel VAs at those positions to hyp VAs.
$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o FORCE
$(call if_changed,hyprel)
# 4) Compile hyp-reloc.S and link it into the existing partially linked object.
# The object file now contains a section with pointers to hyp positions that
# will contain kernel VAs at runtime. These pointers have relocations on them
# so that they get updated as the hyp object is linked into `vmlinux`.
LDFLAGS_kvm_nvhe.rel.o := -r
$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE
$(call if_changed,ld)
# 5) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
# Prefixes names of ELF symbols with '__kvm_nvhe_'.
$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
$(call if_changed,hypcopy)
# The HYPREL command calls `gen-hyprel` to generate an assembly file with
# a list of relocations targeting hyp code/data.
quiet_cmd_hyprel = HYPREL $@
cmd_hyprel = $(HYPREL) $< > $@
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
# when profile optimization is applied. gen-hyprel does not support SHT_REL and
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
# KVM nVHE code is run at a different exception code with a different map, so
# compiler instrumentation that inserts callbacks or checks into the code may
# cause crashes. Just disable it.
GCOV_PROFILE := n
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
# Skip objtool checking for this directory because nVHE code is compiled with
# non-standard build rules.
OBJECT_FILES_NON_STANDARD := y


@@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc)
ret
SYM_FUNC_END(__pi_dcache_clean_inval_poc)
SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc)
SYM_FUNC_START(__pi_icache_inval_pou)
alternative_if ARM64_HAS_CACHE_DIC
isb
ret
alternative_else_nop_endif
invalidate_icache_by_line x0, x1, x2, x3
ret
SYM_FUNC_END(__pi_icache_inval_pou)
SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou)


@@ -0,0 +1,741 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by
* the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware
* Framework for Arm A-profile", which is specified by Arm in document
* number DEN0077.
*
* Copyright (C) 2022 - Google LLC
* Author: Andrew Walbran <qwandor@google.com>
*
* This driver hooks into the SMC trapping logic for the host and intercepts
* all calls falling within the FF-A range. Each call is either:
*
* - Forwarded on unmodified to the SPMD at EL3
* - Rejected as "unsupported"
* - Accompanied by a host stage-2 page-table check/update and reissued
*
* Consequently, any attempts by the host to make guest memory pages
* accessible to the secure world using FF-A will be detected either here
* (in the case that the memory is already owned by the guest) or during
* donation to the guest (in the case that the memory was previously shared
* with the secure world).
*
* To allow the rolling-back of page-table updates and FF-A calls in the
* event of failure, operations involving the RXTX buffers are locked for
* the duration and are therefore serialised.
*/
#include <linux/arm-smccc.h>
#include <linux/arm_ffa.h>
#include <asm/kvm_pkvm.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
#include <nvhe/trap_handler.h>
#include <nvhe/spinlock.h>
/*
* "ID value 0 must be returned at the Non-secure physical FF-A instance"
* We share this ID with the host.
*/
#define HOST_FFA_ID 0
/*
* A buffer to hold the maximum descriptor size we can see from the host,
* which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP
* when resolving the handle on the reclaim path.
*/
struct kvm_ffa_descriptor_buffer {
void *buf;
size_t len;
};
static struct kvm_ffa_descriptor_buffer ffa_desc_buf;
struct kvm_ffa_buffers {
hyp_spinlock_t lock;
void *tx;
void *rx;
};
/*
* Note that we don't currently lock these buffers explicitly, instead
* relying on the locking of the host FFA buffers as we only have one
* client.
*/
static struct kvm_ffa_buffers hyp_buffers;
static struct kvm_ffa_buffers host_buffers;
static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
{
*res = (struct arm_smccc_res) {
.a0 = FFA_ERROR,
.a2 = ffa_errno,
};
}
static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
{
if (ret == FFA_RET_SUCCESS) {
*res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
.a2 = prop };
} else {
ffa_to_smccc_error(res, ret);
}
}
static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
{
ffa_to_smccc_res_prop(res, ret, 0);
}
static void ffa_set_retval(struct kvm_cpu_context *ctxt,
struct arm_smccc_res *res)
{
cpu_reg(ctxt, 0) = res->a0;
cpu_reg(ctxt, 1) = res->a1;
cpu_reg(ctxt, 2) = res->a2;
cpu_reg(ctxt, 3) = res->a3;
}
static bool is_ffa_call(u64 func_id)
{
return ARM_SMCCC_IS_FAST_CALL(func_id) &&
ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM &&
ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM;
}
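Concretely, the memory-interface SMCs proxied further down all land in this window. A hypothetical debug assertion (assuming FFA_FN64_MEM_SHARE from <linux/arm_ffa.h>) would be:
/*
 * Hypothetical sanity check: FFA_FN64_MEM_SHARE is a fast call owned by
 * ARM_SMCCC_OWNER_STANDARD whose function number lies inside
 * [FFA_MIN_FUNC_NUM, FFA_MAX_FUNC_NUM], so is_ffa_call() accepts it.
 */
static inline void example_check_ffa_window(void)
{
	WARN_ON(!is_ffa_call(FFA_FN64_MEM_SHARE));
}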
static int spmd_map_ffa_buffers(u64 ffa_page_count)
{
struct arm_smccc_res res;
arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
hyp_virt_to_phys(hyp_buffers.tx),
hyp_virt_to_phys(hyp_buffers.rx),
ffa_page_count,
0, 0, 0, 0,
&res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
static int spmd_unmap_ffa_buffers(void)
{
struct arm_smccc_res res;
arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
HOST_FFA_ID,
0, 0, 0, 0, 0, 0,
&res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
static void spmd_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
u32 handle_hi, u32 fraglen, u32 endpoint_id)
{
arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
handle_lo, handle_hi, fraglen, endpoint_id,
0, 0, 0,
res);
}
static void spmd_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
u32 handle_hi, u32 fragoff)
{
arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
handle_lo, handle_hi, fragoff, HOST_FFA_ID,
0, 0, 0,
res);
}
static void spmd_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
u32 fraglen)
{
arm_smccc_1_1_smc(func_id, len, fraglen,
0, 0, 0, 0, 0,
res);
}
static void spmd_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
u32 handle_hi, u32 flags)
{
arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
handle_lo, handle_hi, flags,
0, 0, 0, 0,
res);
}
static void spmd_retrieve_req(struct arm_smccc_res *res, u32 len)
{
arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
len, len,
0, 0, 0, 0, 0,
res);
}
static void do_ffa_rxtx_map(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(phys_addr_t, tx, ctxt, 1);
DECLARE_REG(phys_addr_t, rx, ctxt, 2);
DECLARE_REG(u32, npages, ctxt, 3);
int ret = 0;
void *rx_virt, *tx_virt;
if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out;
}
if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out;
}
hyp_spin_lock(&host_buffers.lock);
if (host_buffers.tx) {
ret = FFA_RET_DENIED;
goto out_unlock;
}
ret = spmd_map_ffa_buffers(npages);
if (ret)
goto out_unlock;
ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx));
if (ret) {
ret = FFA_RET_INVALID_PARAMETERS;
goto err_unmap;
}
ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx));
if (ret) {
ret = FFA_RET_INVALID_PARAMETERS;
goto err_unshare_tx;
}
tx_virt = hyp_phys_to_virt(tx);
ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1);
if (ret) {
ret = FFA_RET_INVALID_PARAMETERS;
goto err_unshare_rx;
}
rx_virt = hyp_phys_to_virt(rx);
ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1);
if (ret) {
ret = FFA_RET_INVALID_PARAMETERS;
goto err_unpin_tx;
}
host_buffers.tx = tx_virt;
host_buffers.rx = rx_virt;
out_unlock:
hyp_spin_unlock(&host_buffers.lock);
out:
ffa_to_smccc_res(res, ret);
return;
err_unpin_tx:
hyp_unpin_shared_mem(tx_virt, tx_virt + 1);
err_unshare_rx:
__pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx));
err_unshare_tx:
__pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx));
err_unmap:
spmd_unmap_ffa_buffers();
goto out_unlock;
}
static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
int ret = 0;
if (id != HOST_FFA_ID) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out;
}
hyp_spin_lock(&host_buffers.lock);
if (!host_buffers.tx) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1);
WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx)));
host_buffers.tx = NULL;
hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1);
WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx)));
host_buffers.rx = NULL;
spmd_unmap_ffa_buffers();
out_unlock:
hyp_spin_unlock(&host_buffers.lock);
out:
ffa_to_smccc_res(res, ret);
}
static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
u32 nranges)
{
u32 i;
for (i = 0; i < nranges; ++i) {
struct ffa_mem_region_addr_range *range = &ranges[i];
u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
u64 pfn = hyp_phys_to_pfn(range->address);
if (!PAGE_ALIGNED(sz))
break;
if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE))
break;
}
return i;
}
static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
u32 nranges)
{
u32 i;
for (i = 0; i < nranges; ++i) {
struct ffa_mem_region_addr_range *range = &ranges[i];
u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
u64 pfn = hyp_phys_to_pfn(range->address);
if (!PAGE_ALIGNED(sz))
break;
if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE))
break;
}
return i;
}
static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
u32 nranges)
{
u32 nshared = __ffa_host_share_ranges(ranges, nranges);
int ret = 0;
if (nshared != nranges) {
WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared);
ret = FFA_RET_DENIED;
}
return ret;
}
static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
u32 nranges)
{
u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges);
int ret = 0;
if (nunshared != nranges) {
WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared);
ret = FFA_RET_DENIED;
}
return ret;
}
static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
DECLARE_REG(u32, handle_hi, ctxt, 2);
DECLARE_REG(u32, fraglen, ctxt, 3);
DECLARE_REG(u32, endpoint_id, ctxt, 4);
struct ffa_mem_region_addr_range *buf;
int ret = FFA_RET_INVALID_PARAMETERS;
u32 nr_ranges;
if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)
goto out;
if (fraglen % sizeof(*buf))
goto out;
hyp_spin_lock(&host_buffers.lock);
if (!host_buffers.tx)
goto out_unlock;
buf = hyp_buffers.tx;
memcpy(buf, host_buffers.tx, fraglen);
nr_ranges = fraglen / sizeof(*buf);
ret = ffa_host_share_ranges(buf, nr_ranges);
if (ret) {
/*
* We're effectively aborting the transaction, so we need
* to restore the global state back to what it was prior to
* transmission of the first fragment.
*/
spmd_mem_reclaim(res, handle_lo, handle_hi, 0);
WARN_ON(res->a0 != FFA_SUCCESS);
goto out_unlock;
}
spmd_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id);
if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX)
WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges));
out_unlock:
hyp_spin_unlock(&host_buffers.lock);
out:
if (ret)
ffa_to_smccc_res(res, ret);
/*
* If for any reason this did not succeed, we're in trouble as we have
* now lost the content of the previous fragments and we can't rollback
* the host stage-2 changes. The pages previously marked as shared will
* remain stuck in that state forever, hence preventing the host from
* sharing/donating them again and may possibly lead to subsequent
* failures, but this will not compromise confidentiality.
*/
return;
}
static __always_inline void do_ffa_mem_xfer(const u64 func_id,
struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, len, ctxt, 1);
DECLARE_REG(u32, fraglen, ctxt, 2);
DECLARE_REG(u64, addr_mbz, ctxt, 3);
DECLARE_REG(u32, npages_mbz, ctxt, 4);
struct ffa_composite_mem_region *reg;
struct ffa_mem_region *buf;
u32 offset, nr_ranges;
int ret = 0;
BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
func_id != FFA_FN64_MEM_LEND);
if (addr_mbz || npages_mbz || fraglen > len ||
fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out;
}
if (fraglen < sizeof(struct ffa_mem_region) +
sizeof(struct ffa_mem_region_attributes)) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out;
}
hyp_spin_lock(&host_buffers.lock);
if (!host_buffers.tx) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
buf = hyp_buffers.tx;
memcpy(buf, host_buffers.tx, fraglen);
offset = buf->ep_mem_access[0].composite_off;
if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
reg = (void *)buf + offset;
nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents;
if (nr_ranges % sizeof(reg->constituents[0])) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
nr_ranges /= sizeof(reg->constituents[0]);
ret = ffa_host_share_ranges(reg->constituents, nr_ranges);
if (ret)
goto out_unlock;
spmd_mem_xfer(res, func_id, len, fraglen);
if (fraglen != len) {
if (res->a0 != FFA_MEM_FRAG_RX)
goto err_unshare;
if (res->a3 != fraglen)
goto err_unshare;
} else if (res->a0 != FFA_SUCCESS) {
goto err_unshare;
}
out_unlock:
hyp_spin_unlock(&host_buffers.lock);
out:
if (ret)
ffa_to_smccc_res(res, ret);
return;
err_unshare:
WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges));
goto out_unlock;
}
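do_ffa_mem_xfer() derives the number of address ranges from the bytes left between the composite region's constituent array and the end of the fragment. The sketch below reproduces just that arithmetic with simplified stand-in structs; the field names and sizes are illustrative, not the real FF-A layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for the FF-A descriptor layout. */
struct toy_addr_range { uint64_t address; uint32_t pg_cnt; uint32_t reserved; };
struct toy_composite  { uint32_t total_pg_cnt; uint32_t addr_range_cnt;
			struct toy_addr_range constituents[]; };

/* The constituent array is whatever is left of the fragment after the
 * composite header, so the range count falls out of simple arithmetic. */
static int nr_ranges_in_fragment(uint32_t fraglen, uint32_t composite_off)
{
	uint32_t constituents_off = composite_off +
				    offsetof(struct toy_composite, constituents);
	uint32_t bytes;

	if (fraglen < constituents_off)
		return -1;                      /* malformed: header overruns fragment */
	bytes = fraglen - constituents_off;
	if (bytes % sizeof(struct toy_addr_range))
		return -1;                      /* malformed: partial range entry */
	return bytes / sizeof(struct toy_addr_range);
}

int main(void)
{
	/* a header region followed by three 16-byte constituent ranges */
	printf("%d ranges\n", nr_ranges_in_fragment(0x40 + 3 * 16, 0x40 - 8));
	return 0;
}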
static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
DECLARE_REG(u32, handle_hi, ctxt, 2);
DECLARE_REG(u32, flags, ctxt, 3);
struct ffa_composite_mem_region *reg;
u32 offset, len, fraglen, fragoff;
struct ffa_mem_region *buf;
int ret = 0;
u64 handle;
handle = PACK_HANDLE(handle_lo, handle_hi);
hyp_spin_lock(&host_buffers.lock);
buf = hyp_buffers.tx;
*buf = (struct ffa_mem_region) {
.sender_id = HOST_FFA_ID,
.handle = handle,
};
spmd_retrieve_req(res, sizeof(*buf));
buf = hyp_buffers.rx;
if (res->a0 != FFA_MEM_RETRIEVE_RESP)
goto out_unlock;
len = res->a1;
fraglen = res->a2;
offset = buf->ep_mem_access[0].composite_off;
/*
* We can trust the SPMD to get this right, but let's at least
* check that we end up with something that doesn't look _completely_
* bogus.
*/
if (WARN_ON(offset > len ||
fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
ret = FFA_RET_ABORTED;
goto out_unlock;
}
if (len > ffa_desc_buf.len) {
ret = FFA_RET_NO_MEMORY;
goto out_unlock;
}
buf = ffa_desc_buf.buf;
memcpy(buf, hyp_buffers.rx, fraglen);
for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
spmd_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
if (res->a0 != FFA_MEM_FRAG_TX) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
fraglen = res->a3;
memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
}
spmd_mem_reclaim(res, handle_lo, handle_hi, flags);
if (res->a0 != FFA_SUCCESS)
goto out_unlock;
reg = (void *)buf + offset;
/* If the SPMD was happy, then we should be too. */
WARN_ON(ffa_host_unshare_ranges(reg->constituents,
reg->addr_range_cnt));
out_unlock:
hyp_spin_unlock(&host_buffers.lock);
if (ret)
ffa_to_smccc_res(res, ret);
}
static bool ffa_call_unsupported(u64 func_id)
{
switch (func_id) {
/* Unsupported memory management calls */
case FFA_FN64_MEM_RETRIEVE_REQ:
case FFA_MEM_RETRIEVE_RESP:
case FFA_MEM_RELINQUISH:
case FFA_MEM_OP_PAUSE:
case FFA_MEM_OP_RESUME:
case FFA_MEM_FRAG_RX:
case FFA_FN64_MEM_DONATE:
/* Indirect message passing via RX/TX buffers */
case FFA_MSG_SEND:
case FFA_MSG_POLL:
case FFA_MSG_WAIT:
/* 32-bit variants of 64-bit calls */
case FFA_MSG_SEND_DIRECT_REQ:
case FFA_MSG_SEND_DIRECT_RESP:
case FFA_RXTX_MAP:
case FFA_MEM_DONATE:
case FFA_MEM_RETRIEVE_REQ:
return true;
}
return false;
}
static bool do_ffa_features(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
u64 prop = 0;
int ret = 0;
if (ffa_call_unsupported(id)) {
ret = FFA_RET_NOT_SUPPORTED;
goto out_handled;
}
switch (id) {
case FFA_MEM_SHARE:
case FFA_FN64_MEM_SHARE:
case FFA_MEM_LEND:
case FFA_FN64_MEM_LEND:
ret = FFA_RET_SUCCESS;
prop = 0; /* No support for dynamic buffers */
goto out_handled;
default:
return false;
}
out_handled:
ffa_to_smccc_res_prop(res, ret, prop);
return true;
}
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, func_id, host_ctxt, 0);
struct arm_smccc_res res;
if (!is_ffa_call(func_id))
return false;
switch (func_id) {
case FFA_FEATURES:
if (!do_ffa_features(&res, host_ctxt))
return false;
goto out_handled;
/* Memory management */
case FFA_FN64_RXTX_MAP:
do_ffa_rxtx_map(&res, host_ctxt);
goto out_handled;
case FFA_RXTX_UNMAP:
do_ffa_rxtx_unmap(&res, host_ctxt);
goto out_handled;
case FFA_MEM_SHARE:
case FFA_FN64_MEM_SHARE:
do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt);
goto out_handled;
case FFA_MEM_RECLAIM:
do_ffa_mem_reclaim(&res, host_ctxt);
goto out_handled;
case FFA_MEM_LEND:
case FFA_FN64_MEM_LEND:
do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt);
goto out_handled;
case FFA_MEM_FRAG_TX:
do_ffa_mem_frag_tx(&res, host_ctxt);
goto out_handled;
}
if (!ffa_call_unsupported(func_id))
return false; /* Pass through */
ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED);
out_handled:
ffa_set_retval(host_ctxt, &res);
return true;
}
int hyp_ffa_init(void *pages)
{
struct arm_smccc_res res;
size_t min_rxtx_sz;
void *tx, *rx;
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_1)
return 0;
arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
if (res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
if (res.a0 != FFA_VERSION_1_0)
return -EOPNOTSUPP;
arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
if (res.a2 != HOST_FFA_ID)
return -EINVAL;
arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
0, 0, 0, 0, 0, 0, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
switch (res.a2) {
case FFA_FEAT_RXTX_MIN_SZ_4K:
min_rxtx_sz = SZ_4K;
break;
case FFA_FEAT_RXTX_MIN_SZ_16K:
min_rxtx_sz = SZ_16K;
break;
case FFA_FEAT_RXTX_MIN_SZ_64K:
min_rxtx_sz = SZ_64K;
break;
default:
return -EINVAL;
}
if (min_rxtx_sz > PAGE_SIZE)
return -EOPNOTSUPP;
tx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
rx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) {
.buf = pages,
.len = PAGE_SIZE *
(hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)),
};
hyp_buffers = (struct kvm_ffa_buffers) {
.lock = __HYP_SPIN_LOCK_UNLOCKED,
.tx = tx,
.rx = rx,
};
host_buffers = (struct kvm_ffa_buffers) {
.lock = __HYP_SPIN_LOCK_UNLOCKED,
};
return 0;
}
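hyp_ffa_init() carves the donated page run into a TX mailbox, an RX mailbox and a descriptor buffer that takes whatever is left. A standalone sketch of the layout arithmetic, with made-up values for the mailbox and proxy page counts:

#include <stddef.h>
#include <stdio.h>

/* Illustrative numbers only; the real values come from the hypervisor config. */
#define PAGE_SZ        4096u
#define MBOX_NR_PAGES  1u      /* pages per mailbox */
#define PROXY_PAGES    32u     /* total pages handed to the FF-A proxy at init */

int main(void)
{
	size_t tx_off   = 0;
	size_t rx_off   = tx_off + MBOX_NR_PAGES * PAGE_SZ;
	size_t desc_off = rx_off + MBOX_NR_PAGES * PAGE_SZ;
	size_t desc_len = PAGE_SZ * (PROXY_PAGES - 2 * MBOX_NR_PAGES);

	printf("tx   @ +0x%zx (%u pages)\n", tx_off, MBOX_NR_PAGES);
	printf("rx   @ +0x%zx (%u pages)\n", rx_off, MBOX_NR_PAGES);
	printf("desc @ +0x%zx (%zu bytes)\n", desc_off, desc_len);
	return 0;
}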

File diff suppressed because it is too large


@@ -8,6 +8,8 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
DEFINE_PER_CPU(int, hyp_cpu_number);
/*
* nVHE copy of data structures tracking available CPU cores.
* Only entries for CPUs that were online at KVM init are populated.
@@ -23,6 +25,8 @@ u64 cpu_logical_map(unsigned int cpu)
return hyp_cpu_logical_map[cpu];
}
unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS];
unsigned long __hyp_per_cpu_offset(unsigned int cpu)
{
unsigned long *cpu_base_array;


@@ -25,5 +25,7 @@ SECTIONS {
BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
END_HYP_SECTION
HYP_SECTION(.bss)
HYP_SECTION(.data)
}


@@ -0,0 +1,570 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2022 Google LLC
* Author: David Brazdil <dbrazdil@google.com>
*/
#include <linux/kvm_host.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pkvm.h>
#include <hyp/adjust_pc.h>
#include <nvhe/iommu.h>
#include <nvhe/mm.h>
#define DRV_ID(drv_addr) ((unsigned long)drv_addr)
enum {
IOMMU_DRIVER_NOT_READY = 0,
IOMMU_DRIVER_INITIALIZING,
IOMMU_DRIVER_READY,
};
/* List of registered IOMMU drivers, protected with iommu_drv_lock. */
static LIST_HEAD(iommu_drivers);
/* IOMMU device list. Must only be accessed with host_mmu.lock held. */
static LIST_HEAD(iommu_list);
static bool iommu_finalized;
static DEFINE_HYP_SPINLOCK(iommu_registration_lock);
static DEFINE_HYP_SPINLOCK(iommu_drv_lock);
static void *iommu_mem_pool;
static size_t iommu_mem_remaining;
static void assert_host_component_locked(void)
{
hyp_assert_lock_held(&host_mmu.lock);
}
static void host_lock_component(void)
{
hyp_spin_lock(&host_mmu.lock);
}
static void host_unlock_component(void)
{
hyp_spin_unlock(&host_mmu.lock);
}
/*
* Find IOMMU driver by its ID. The input ID is treated as untrusted
* and is properly validated.
*/
static inline struct pkvm_iommu_driver *get_driver(unsigned long id)
{
struct pkvm_iommu_driver *drv, *ret = NULL;
hyp_spin_lock(&iommu_drv_lock);
list_for_each_entry(drv, &iommu_drivers, list) {
if (DRV_ID(drv) == id) {
ret = drv;
break;
}
}
hyp_spin_unlock(&iommu_drv_lock);
return ret;
}
static inline bool driver_acquire_init(struct pkvm_iommu_driver *drv)
{
return atomic_cmpxchg_acquire(&drv->state, IOMMU_DRIVER_NOT_READY,
IOMMU_DRIVER_INITIALIZING)
== IOMMU_DRIVER_NOT_READY;
}
static inline void driver_release_init(struct pkvm_iommu_driver *drv,
bool success)
{
atomic_set_release(&drv->state, success ? IOMMU_DRIVER_READY
: IOMMU_DRIVER_NOT_READY);
}
static inline bool is_driver_ready(struct pkvm_iommu_driver *drv)
{
return atomic_read(&drv->state) == IOMMU_DRIVER_READY;
}
static size_t __iommu_alloc_size(struct pkvm_iommu_driver *drv)
{
return ALIGN(sizeof(struct pkvm_iommu) + drv->ops->data_size,
sizeof(unsigned long));
}
static bool validate_driver_id_unique(struct pkvm_iommu_driver *drv)
{
struct pkvm_iommu_driver *cur;
hyp_assert_lock_held(&iommu_drv_lock);
list_for_each_entry(cur, &iommu_drivers, list) {
if (DRV_ID(drv) == DRV_ID(cur))
return false;
}
return true;
}
static int __pkvm_register_iommu_driver(struct pkvm_iommu_driver *drv)
{
int ret = 0;
if (!drv)
return -EINVAL;
hyp_assert_lock_held(&iommu_registration_lock);
hyp_spin_lock(&iommu_drv_lock);
if (validate_driver_id_unique(drv))
list_add_tail(&drv->list, &iommu_drivers);
else
ret = -EEXIST;
hyp_spin_unlock(&iommu_drv_lock);
return ret;
}
/* Global memory pool for allocating IOMMU list entry structs. */
static inline struct pkvm_iommu *alloc_iommu(struct pkvm_iommu_driver *drv,
void *mem, size_t mem_size)
{
size_t size = __iommu_alloc_size(drv);
void *ptr;
assert_host_component_locked();
/*
* If new memory is being provided, replace the existing pool with it.
* Any remaining memory in the pool is discarded.
*/
if (mem && mem_size) {
iommu_mem_pool = mem;
iommu_mem_remaining = mem_size;
}
if (size > iommu_mem_remaining)
return NULL;
ptr = iommu_mem_pool;
iommu_mem_pool += size;
iommu_mem_remaining -= size;
return ptr;
}
static inline void free_iommu(struct pkvm_iommu_driver *drv, struct pkvm_iommu *ptr)
{
size_t size = __iommu_alloc_size(drv);
assert_host_component_locked();
if (!ptr)
return;
/* Only allow freeing the last allocated buffer. */
if ((void *)ptr + size != iommu_mem_pool)
return;
iommu_mem_pool -= size;
iommu_mem_remaining += size;
}
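alloc_iommu() and free_iommu() implement a bump allocator over host-donated memory in which only the most recent allocation can be returned. A minimal standalone sketch of that discipline (pool size and names are illustrative):

#include <stddef.h>
#include <stdio.h>

static unsigned char pool_mem[256];
static unsigned char *pool_cur = pool_mem;
static size_t pool_left = sizeof(pool_mem);

/* Bump-allocate 'size' bytes, or NULL if the pool is exhausted. */
static void *bump_alloc(size_t size)
{
	void *p;

	if (size > pool_left)
		return NULL;
	p = pool_cur;
	pool_cur += size;
	pool_left -= size;
	return p;
}

/* Only the most recent allocation can be handed back, like free_iommu() above. */
static void bump_free_last(void *p, size_t size)
{
	if ((unsigned char *)p + size != pool_cur)
		return;                 /* not the last allocation: silently keep it */
	pool_cur -= size;
	pool_left += size;
}

int main(void)
{
	void *a = bump_alloc(64);
	void *b = bump_alloc(64);

	bump_free_last(a, 64);          /* ignored: 'a' is not on top of the pool */
	bump_free_last(b, 64);          /* reclaimed */
	printf("%zu bytes left\n", pool_left);   /* 192 */
	return 0;
}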
static bool is_overlap(phys_addr_t r1_start, size_t r1_size,
phys_addr_t r2_start, size_t r2_size)
{
phys_addr_t r1_end = r1_start + r1_size;
phys_addr_t r2_end = r2_start + r2_size;
return (r1_start < r2_end) && (r2_start < r1_end);
}
static bool is_mmio_range(phys_addr_t base, size_t size)
{
struct memblock_region *reg;
phys_addr_t limit = BIT(host_mmu.pgt.ia_bits);
size_t i;
/* Check against limits of host IPA space. */
if ((base >= limit) || !size || (size > limit - base))
return false;
for (i = 0; i < hyp_memblock_nr; i++) {
reg = &hyp_memory[i];
if (is_overlap(base, size, reg->base, reg->size))
return false;
}
return true;
}
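is_overlap() is the usual half-open interval test, and is_mmio_range() accepts a range only if it fits below the IPA limit and misses every memblock. A simplified standalone mirror of the two checks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };

/* Half-open interval overlap test, same shape as is_overlap() above. */
static bool overlaps(uint64_t a_start, uint64_t a_size,
		     uint64_t b_start, uint64_t b_size)
{
	return a_start < b_start + b_size && b_start < a_start + a_size;
}

/* A range counts as MMIO here if it stays inside the address limit and
 * misses every memory region - a simplified mirror of is_mmio_range(). */
static bool is_mmio(uint64_t base, uint64_t size, uint64_t limit,
		    const struct region *mem, int nr_mem)
{
	int i;

	if (base >= limit || !size || size > limit - base)
		return false;
	for (i = 0; i < nr_mem; i++)
		if (overlaps(base, size, mem[i].base, mem[i].size))
			return false;
	return true;
}

int main(void)
{
	const struct region mem[] = { { 0x80000000ull, 0x40000000ull } };

	printf("%d\n", is_mmio(0x10000000ull, 0x1000, 1ull << 40, mem, 1)); /* 1: clear of RAM */
	printf("%d\n", is_mmio(0x80001000ull, 0x1000, 1ull << 40, mem, 1)); /* 0: inside RAM  */
	return 0;
}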
static int __snapshot_host_stage2(u64 start, u64 pa_max, u32 level,
kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flags,
void * const arg)
{
struct pkvm_iommu_driver * const drv = arg;
u64 end = start + kvm_granule_size(level);
kvm_pte_t pte = *ptep;
/*
* Valid stage-2 entries are created lazily, invalid ones eagerly.
* Note: In the future we may need to check if [start,end) is MMIO.
* Note: Drivers initialize their PTs to all memory owned by the host,
* so we only call the driver on regions where that is not the case.
*/
if (pte && !kvm_pte_valid(pte))
drv->ops->host_stage2_idmap_prepare(start, end, /*prot*/ 0);
return 0;
}
static int snapshot_host_stage2(struct pkvm_iommu_driver * const drv)
{
struct kvm_pgtable_walker walker = {
.cb = __snapshot_host_stage2,
.arg = drv,
.flags = KVM_PGTABLE_WALK_LEAF,
};
struct kvm_pgtable *pgt = &host_mmu.pgt;
if (!drv->ops->host_stage2_idmap_prepare)
return 0;
return kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker);
}
static bool validate_against_existing_iommus(struct pkvm_iommu *dev)
{
struct pkvm_iommu *other;
assert_host_component_locked();
list_for_each_entry(other, &iommu_list, list) {
/* Device ID must be unique. */
if (dev->id == other->id)
return false;
/* MMIO regions must not overlap. */
if (is_overlap(dev->pa, dev->size, other->pa, other->size))
return false;
}
return true;
}
static struct pkvm_iommu *find_iommu_by_id(unsigned long id)
{
struct pkvm_iommu *dev;
assert_host_component_locked();
list_for_each_entry(dev, &iommu_list, list) {
if (dev->id == id)
return dev;
}
return NULL;
}
/*
* Initialize EL2 IOMMU driver.
*
* This is a common hypercall for driver initialization. Driver-specific
* arguments are passed in a shared memory buffer. The driver is expected to
* initialize its page-table bookkeeping.
*/
int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size)
{
const struct pkvm_iommu_ops *ops;
int ret = 0;
/* New driver initialization not allowed after __pkvm_iommu_finalize(). */
hyp_spin_lock(&iommu_registration_lock);
if (iommu_finalized) {
ret = -EPERM;
goto out_unlock;
}
ret = __pkvm_register_iommu_driver(drv);
if (ret)
return ret;
if (!drv->ops) {
ret = -EINVAL;
goto out_unlock;
}
if (!driver_acquire_init(drv)) {
ret = -EBUSY;
goto out_unlock;
}
ops = drv->ops;
/* This can change stage-2 mappings. */
if (ops->init) {
ret = hyp_pin_shared_mem(data, data + size);
if (!ret) {
ret = ops->init(data, size);
hyp_unpin_shared_mem(data, data + size);
}
if (ret)
goto out_release;
}
/*
* Walk host stage-2 and pass current mappings to the driver. Start
* accepting host stage-2 updates as soon as the host lock is released.
*/
host_lock_component();
ret = snapshot_host_stage2(drv);
if (!ret)
driver_release_init(drv, /*success=*/true);
host_unlock_component();
out_release:
if (ret)
driver_release_init(drv, /*success=*/false);
out_unlock:
hyp_spin_unlock(&iommu_registration_lock);
return ret;
}
int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id,
phys_addr_t dev_pa, size_t dev_size,
unsigned long parent_id,
void *kern_mem_va, size_t mem_size)
{
struct pkvm_iommu *dev = NULL;
struct pkvm_iommu_driver *drv;
void *mem_va = NULL;
int ret = 0;
/* New device registration not allowed after __pkvm_iommu_finalize(). */
hyp_spin_lock(&iommu_registration_lock);
if (iommu_finalized) {
ret = -EPERM;
goto out_unlock;
}
drv = get_driver(drv_id);
if (!drv || !is_driver_ready(drv)) {
ret = -ENOENT;
goto out_unlock;
}
if (!PAGE_ALIGNED(dev_pa) || !PAGE_ALIGNED(dev_size)) {
ret = -EINVAL;
goto out_unlock;
}
if (!is_mmio_range(dev_pa, dev_size)) {
ret = -EINVAL;
goto out_unlock;
}
/*
* Accept memory donation if the host is providing new memory.
* Note: We do not return the memory even if there is an error later.
*/
if (kern_mem_va && mem_size) {
mem_va = kern_hyp_va(kern_mem_va);
if (!PAGE_ALIGNED(mem_va) || !PAGE_ALIGNED(mem_size)) {
ret = -EINVAL;
goto out_unlock;
}
ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(mem_va),
mem_size >> PAGE_SHIFT);
if (ret)
goto out_unlock;
}
host_lock_component();
/* Allocate memory for the new device entry. */
dev = alloc_iommu(drv, mem_va, mem_size);
if (!dev) {
ret = -ENOMEM;
goto out_free;
}
/* Populate the new device entry. */
*dev = (struct pkvm_iommu){
.children = LIST_HEAD_INIT(dev->children),
.id = dev_id,
.ops = drv->ops,
.pa = dev_pa,
.size = dev_size,
};
if (!validate_against_existing_iommus(dev)) {
ret = -EBUSY;
goto out_free;
}
if (parent_id) {
dev->parent = find_iommu_by_id(parent_id);
if (!dev->parent) {
ret = -EINVAL;
goto out_free;
}
if (dev->parent->ops->validate_child) {
ret = dev->parent->ops->validate_child(dev->parent, dev);
if (ret)
goto out_free;
}
}
if (dev->ops->validate) {
ret = dev->ops->validate(dev);
if (ret)
goto out_free;
}
/*
* Unmap the device's MMIO range from host stage-2. If registration
* is successful, future attempts to re-map will be blocked by
* pkvm_iommu_host_stage2_adjust_range.
*/
ret = host_stage2_unmap_reg_locked(dev_pa, dev_size);
if (ret)
goto out_free;
/* Create EL2 mapping for the device. */
ret = __pkvm_create_private_mapping(dev_pa, dev_size,
PAGE_HYP_DEVICE, (unsigned long *)(&dev->va));
if (ret)
goto out_free;
/* Register device and prevent host from mapping the MMIO range. */
list_add_tail(&dev->list, &iommu_list);
if (dev->parent)
list_add_tail(&dev->siblings, &dev->parent->children);
out_free:
if (ret)
free_iommu(drv, dev);
host_unlock_component();
out_unlock:
hyp_spin_unlock(&iommu_registration_lock);
return ret;
}
int __pkvm_iommu_finalize(void)
{
int ret = 0;
hyp_spin_lock(&iommu_registration_lock);
if (!iommu_finalized)
iommu_finalized = true;
else
ret = -EPERM;
hyp_spin_unlock(&iommu_registration_lock);
return ret;
}
int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event)
{
struct pkvm_iommu *dev;
int ret;
host_lock_component();
dev = find_iommu_by_id(dev_id);
if (dev) {
if (event == PKVM_IOMMU_PM_SUSPEND) {
ret = dev->ops->suspend ? dev->ops->suspend(dev) : 0;
if (!ret)
dev->powered = false;
} else if (event == PKVM_IOMMU_PM_RESUME) {
ret = dev->ops->resume ? dev->ops->resume(dev) : 0;
if (!ret)
dev->powered = true;
} else {
ret = -EINVAL;
}
} else {
ret = -ENODEV;
}
host_unlock_component();
return ret;
}
/*
* Check host memory access against IOMMUs' MMIO regions.
* Returns -EPERM if the address is within the bounds of a registered device.
* Otherwise returns zero and adjusts boundaries of the new mapping to avoid
* MMIO regions of registered IOMMUs.
*/
int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start,
phys_addr_t *end)
{
struct pkvm_iommu *dev;
phys_addr_t new_start = *start;
phys_addr_t new_end = *end;
phys_addr_t dev_start, dev_end;
assert_host_component_locked();
list_for_each_entry(dev, &iommu_list, list) {
dev_start = dev->pa;
dev_end = dev_start + dev->size;
if (addr < dev_start)
new_end = min(new_end, dev_start);
else if (addr >= dev_end)
new_start = max(new_start, dev_end);
else
return -EPERM;
}
*start = new_start;
*end = new_end;
return 0;
}
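The walker either refuses the access when the faulting address sits inside a device window, or clamps the candidate mapping so it stops short of every window while still covering the faulting address. A standalone sketch of that clamping, using plain integers for addresses:

#include <stdint.h>
#include <stdio.h>

struct window { uint64_t start, end; };   /* device MMIO window, half-open */

/* Clamp [*start, *end) around the device windows so it still contains 'addr'.
 * Returns -1 if 'addr' itself falls inside a window (access must be refused). */
static int adjust_range(uint64_t addr, uint64_t *start, uint64_t *end,
			const struct window *w, int nr)
{
	uint64_t s = *start, e = *end;
	int i;

	for (i = 0; i < nr; i++) {
		if (addr < w[i].start)
			e = e < w[i].start ? e : w[i].start;   /* stop before the window */
		else if (addr >= w[i].end)
			s = s > w[i].end ? s : w[i].end;       /* start after the window */
		else
			return -1;                             /* faulting on the device */
	}
	*start = s;
	*end = e;
	return 0;
}

int main(void)
{
	const struct window w[] = { { 0x2000, 0x3000 } };
	uint64_t s = 0x0000, e = 0x10000;

	if (!adjust_range(0x1000, &s, &e, w, 1))
		printf("mapped [%#llx, %#llx)\n",              /* [0x0, 0x2000) */
		       (unsigned long long)s, (unsigned long long)e);
	return 0;
}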
bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr,
phys_addr_t pa)
{
struct pkvm_iommu *dev;
assert_host_component_locked();
list_for_each_entry(dev, &iommu_list, list) {
if (pa < dev->pa || pa >= dev->pa + dev->size)
continue;
/* No 'powered' check - the host assumes it is powered. */
if (!dev->ops->host_dabt_handler ||
!dev->ops->host_dabt_handler(dev, host_ctxt, esr, pa - dev->pa))
return false;
kvm_skip_host_instr();
return true;
}
return false;
}
void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot)
{
struct pkvm_iommu_driver *drv;
struct pkvm_iommu *dev;
assert_host_component_locked();
hyp_spin_lock(&iommu_drv_lock);
list_for_each_entry(drv, &iommu_drivers, list) {
if (drv && is_driver_ready(drv) && drv->ops->host_stage2_idmap_prepare)
drv->ops->host_stage2_idmap_prepare(start, end, prot);
}
hyp_spin_unlock(&iommu_drv_lock);
list_for_each_entry(dev, &iommu_list, list) {
if (dev->powered && dev->ops->host_stage2_idmap_apply)
dev->ops->host_stage2_idmap_apply(dev, start, end);
}
list_for_each_entry(dev, &iommu_list, list) {
if (dev->powered && dev->ops->host_stage2_idmap_complete)
dev->ops->host_stage2_idmap_complete(dev);
}
}


@@ -0,0 +1,321 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2022 - Google LLC
*/
#include <asm/io-mpt-s2mpu.h>
#define GRAN_BYTE(gran) ((gran << V9_MPT_PROT_BITS) | (gran))
#define GRAN_HWORD(gran) ((GRAN_BYTE(gran) << 8) | (GRAN_BYTE(gran)))
#define GRAN_WORD(gran) (((u32)(GRAN_HWORD(gran) << 16) | (GRAN_HWORD(gran))))
#define GRAN_DWORD(gran) ((u64)((u64)GRAN_WORD(gran) << 32) | (u64)(GRAN_WORD(gran)))
#define SMPT_NUM_TO_BYTE(x) ((x) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE(config_prot_bits))
#define BYTE_TO_SMPT_INDEX(x) ((x) / SMPT_WORD_BYTE_RANGE(config_prot_bits))
/*
* MPT table ops can only be configured for one version at runtime.
* These variables hold the version-specific data set at runtime init, to avoid
* duplicate code or unnecessary checks during operations.
*/
static u32 config_prot_bits;
static u32 config_access_shift;
static const u64 *config_lut_prot;
static u32 config_gran_mask;
static u32 this_version;
/*
* Look-up table of page-table entries for the different protection values.
* The granularity is a compile-time config, so this can also be done for
* this array without needing duplicate arrays.
*/
static const u64 v9_mpt_prot_doubleword[] = {
[MPT_PROT_NONE] = 0x0000000000000000 | GRAN_DWORD(SMPT_GRAN_ATTR),
[MPT_PROT_R] = 0x4444444444444444 | GRAN_DWORD(SMPT_GRAN_ATTR),
[MPT_PROT_W] = 0x8888888888888888 | GRAN_DWORD(SMPT_GRAN_ATTR),
[MPT_PROT_RW] = 0xcccccccccccccccc | GRAN_DWORD(SMPT_GRAN_ATTR),
};
static const u64 mpt_prot_doubleword[] = {
[MPT_PROT_NONE] = 0x0000000000000000,
[MPT_PROT_R] = 0x5555555555555555,
[MPT_PROT_W] = 0xaaaaaaaaaaaaaaaa,
[MPT_PROT_RW] = 0xffffffffffffffff,
};
static inline int pte_from_addr_smpt(u32 *smpt, u64 addr)
{
u32 word_idx, idx, pte, val;
word_idx = BYTE_TO_SMPT_INDEX(addr);
val = READ_ONCE(smpt[word_idx]);
idx = (addr / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits);
pte = (val >> (idx * config_prot_bits)) & ((1 << config_prot_bits)-1);
return pte;
}
static inline int prot_from_addr_smpt(u32 *smpt, u64 addr)
{
int pte = pte_from_addr_smpt(smpt, addr);
return (pte >> config_access_shift);
}
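Each 32-bit SMPT word packs several per-granule entries, so an address resolves to a word index plus an element index inside that word. The sketch below redoes that arithmetic with an assumed 4 KiB granule and 2 protection bits per entry (both illustrative):

#include <stdint.h>
#include <stdio.h>

#define GRAN            4096u   /* illustrative granule size */
#define PROT_BITS       2u      /* illustrative bits per entry */
#define ELEMS_PER_WORD  (32u / PROT_BITS)           /* 16 entries per u32 */
#define WORD_BYTE_RANGE (GRAN * ELEMS_PER_WORD)     /* bytes covered by one word */

/* Extract the per-granule protection value for 'addr' from a packed table. */
static unsigned int pte_at(const uint32_t *smpt, uint64_t addr)
{
	uint64_t word_idx = addr / WORD_BYTE_RANGE;
	unsigned int elem = (addr / GRAN) % ELEMS_PER_WORD;

	return (smpt[word_idx] >> (elem * PROT_BITS)) & ((1u << PROT_BITS) - 1);
}

int main(void)
{
	uint32_t smpt[4] = { 0 };

	/* Mark the granule at 20 KiB (word 0, element 5) as read-write (0b11). */
	smpt[0] |= 3u << (5 * PROT_BITS);
	printf("prot=%u\n", pte_at(smpt, 5 * GRAN));   /* 3 */
	printf("prot=%u\n", pte_at(smpt, 6 * GRAN));   /* 0 */
	return 0;
}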
/* Set protection bits of SMPT in a given range without using memset. */
static void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte,
size_t end_gb_byte, enum mpt_prot prot)
{
size_t i, start_word_byte, end_word_byte, word_idx, first_elem, last_elem;
u32 val;
/* Iterate over u32 words. */
start_word_byte = start_gb_byte;
while (start_word_byte < end_gb_byte) {
/* Determine the range of bytes covered by this word. */
word_idx = BYTE_TO_SMPT_INDEX(start_word_byte);
end_word_byte = min(
ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE(config_prot_bits)),
end_gb_byte);
/* Identify protection bit offsets within the word. */
first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits);
last_elem =
((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits);
/* Modify the corresponding word. */
val = READ_ONCE(smpt[word_idx]);
for (i = first_elem; i <= last_elem; i++) {
val &= ~(MPT_PROT_MASK << (i * config_prot_bits + config_access_shift));
val |= prot << (i * config_prot_bits + config_access_shift);
}
WRITE_ONCE(smpt[word_idx], val);
start_word_byte = end_word_byte;
}
}
/* Set protection bits of SMPT in a given range. */
static void __set_smpt_range(u32 *smpt, size_t start_gb_byte,
size_t end_gb_byte, enum mpt_prot prot)
{
size_t interlude_start, interlude_end, interlude_bytes, word_idx;
char prot_byte = (char)config_lut_prot[prot];
if (start_gb_byte >= end_gb_byte)
return;
/* Check if range spans at least one full u32 word. */
interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE(config_prot_bits));
interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE(config_prot_bits));
/*
* If not, fall back to editing bits in the given range.
* This sets bits for PTEs that cover less than a full 32-bit word (can't be done by memset).
*/
if (interlude_start >= interlude_end) {
__set_smpt_range_slow(smpt, start_gb_byte, end_gb_byte, prot);
return;
}
/* Use bit-editing for prologue/epilogue, memset for interlude. */
word_idx = BYTE_TO_SMPT_INDEX(interlude_start);
interlude_bytes = SMPT_NUM_TO_BYTE(interlude_end - interlude_start);
/*
* These are pages at the start and at the end that are
* not part of a full 32-bit SMPT word.
*/
__set_smpt_range_slow(smpt, start_gb_byte, interlude_start, prot);
memset(&smpt[word_idx], prot_byte, interlude_bytes);
__set_smpt_range_slow(smpt, interlude_end, end_gb_byte, prot);
}
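__set_smpt_range() splits the byte range into an unaligned prologue, a word-aligned middle that can be filled with memset because the protection byte repeats the same pattern, and an unaligned epilogue. A standalone sketch of just the split, with an invented per-word byte coverage:

#include <stdint.h>
#include <stdio.h>

#define WORD_RANGE 64u   /* illustrative: bytes of address space covered per table word */

static uint64_t align_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
static uint64_t align_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }

/* Split [start, end) into slow prologue, memset-able middle, slow epilogue. */
static void split_range(uint64_t start, uint64_t end)
{
	uint64_t mid_start = align_up(start, WORD_RANGE);
	uint64_t mid_end   = align_down(end, WORD_RANGE);

	if (mid_start >= mid_end) {
		printf("slow path only: [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
		return;
	}
	printf("prologue [%llu, %llu) memset [%llu, %llu) epilogue [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)mid_start,
	       (unsigned long long)mid_start, (unsigned long long)mid_end,
	       (unsigned long long)mid_end, (unsigned long long)end);
}

int main(void)
{
	split_range(10, 250);   /* prologue [10,64) memset [64,192) epilogue [192,250) */
	split_range(10, 40);    /* too small to cover a whole word: slow path only */
	return 0;
}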
/* Returns true if all SMPT protection bits match 'prot'. */
static bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot)
{
size_t i;
u64 *doublewords = (u64 *)smpt;
for (i = 0; i < SMPT_NUM_WORDS(config_prot_bits) / 2; i++) {
if (doublewords[i] != config_lut_prot[prot])
return false;
}
return true;
}
/*
* Set protection bits of FMPT/SMPT in a given range.
* Returns flags specifying whether L1/L2 changes need to be made visible
* to the device.
*/
static void __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte,
size_t end_gb_byte, enum mpt_prot prot)
{
if (start_gb_byte == 0 && end_gb_byte >= SZ_1G) {
/* Update covers the entire GB region. */
if (fmpt->gran_1g && fmpt->prot == prot) {
fmpt->flags = 0;
return;
}
fmpt->gran_1g = true;
fmpt->prot = prot;
fmpt->flags = MPT_UPDATE_L1;
return;
}
if (fmpt->gran_1g) {
/* GB region currently uses 1G mapping. */
if (fmpt->prot == prot) {
fmpt->flags = 0;
return;
}
/*
* Range has different mapping than the rest of the GB.
* Convert to PAGE_SIZE mapping.
*/
fmpt->gran_1g = false;
__set_smpt_range(fmpt->smpt, 0, start_gb_byte, fmpt->prot);
__set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot);
__set_smpt_range(fmpt->smpt, end_gb_byte, SZ_1G, fmpt->prot);
fmpt->flags = MPT_UPDATE_L1 | MPT_UPDATE_L2;
return;
}
/* GB region currently uses PAGE_SIZE mapping. */
__set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot);
/* Check if the entire GB region has the same prot bits. */
if (!__is_smpt_uniform(fmpt->smpt, prot)) {
fmpt->flags = MPT_UPDATE_L2;
return;
}
fmpt->gran_1g = true;
fmpt->prot = prot;
fmpt->flags = MPT_UPDATE_L1;
}
static u32 smpt_size(void)
{
return SMPT_SIZE(config_prot_bits);
}
static void __set_l1entry_attr_with_prot(void *dev_va, unsigned int gb,
unsigned int vid, enum mpt_prot prot)
{
writel_relaxed(L1ENTRY_ATTR_1G(prot),
dev_va + REG_NS_L1ENTRY_ATTR(vid, gb));
}
static void __set_l1entry_attr_with_fmpt(void *dev_va, unsigned int gb,
unsigned int vid, struct fmpt *fmpt)
{
if (fmpt->gran_1g) {
__set_l1entry_attr_with_prot(dev_va, gb, vid, fmpt->prot);
} else {
/* Order against writes to the SMPT. */
writel(config_gran_mask | L1ENTRY_ATTR_L2TABLE_EN,
dev_va + REG_NS_L1ENTRY_ATTR(vid, gb));
}
}
static void __set_l1entry_l2table_addr(void *dev_va, unsigned int gb,
unsigned int vid, phys_addr_t addr)
{
/* Order against writes to the SMPT. */
writel(L1ENTRY_L2TABLE_ADDR(addr),
dev_va + REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb));
}
static void init_with_prot(void *dev_va, enum mpt_prot prot)
{
unsigned int gb, vid;
for_each_gb_and_vid(gb, vid)
__set_l1entry_attr_with_prot(dev_va, gb, vid, prot);
}
static void init_with_mpt(void *dev_va, struct mpt *mpt)
{
unsigned int gb, vid;
struct fmpt *fmpt;
for_each_gb_and_vid(gb, vid) {
fmpt = &mpt->fmpt[gb];
__set_l1entry_l2table_addr(dev_va, gb, vid, __hyp_pa(fmpt->smpt));
__set_l1entry_attr_with_fmpt(dev_va, gb, vid, fmpt);
}
}
static void apply_range(void *dev_va, struct mpt *mpt, u32 first_gb, u32 last_gb)
{
unsigned int gb, vid;
struct fmpt *fmpt;
for_each_gb_in_range(gb, first_gb, last_gb) {
fmpt = &mpt->fmpt[gb];
if (fmpt->flags & MPT_UPDATE_L1) {
for_each_vid(vid)
__set_l1entry_attr_with_fmpt(dev_va, gb, vid, fmpt);
}
}
}
static void prepare_range(struct mpt *mpt, phys_addr_t first_byte,
phys_addr_t last_byte, enum mpt_prot prot)
{
unsigned int first_gb = first_byte / SZ_1G;
unsigned int last_gb = last_byte / SZ_1G;
size_t start_gb_byte, end_gb_byte;
unsigned int gb;
struct fmpt *fmpt;
for_each_gb_in_range(gb, first_gb, last_gb) {
fmpt = &mpt->fmpt[gb];
start_gb_byte = (gb == first_gb) ? first_byte % SZ_1G : 0;
end_gb_byte = (gb == last_gb) ? (last_byte % SZ_1G) + 1 : SZ_1G;
__set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot);
if (fmpt->flags & MPT_UPDATE_L2)
kvm_flush_dcache_to_poc(fmpt->smpt, smpt_size());
}
}
static const struct s2mpu_mpt_ops this_ops = {
.smpt_size = smpt_size,
.init_with_prot = init_with_prot,
.init_with_mpt = init_with_mpt,
.apply_range = apply_range,
.prepare_range = prepare_range,
.pte_from_addr_smpt = pte_from_addr_smpt,
};
const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg)
{
/* If called before with a different version, return NULL. */
if (WARN_ON(this_version && (this_version != cfg.version)))
return NULL;
/* 2MB granularity not supported in V9 */
if ((cfg.version == S2MPU_VERSION_9) && (SMPT_GRAN_ATTR != L1ENTRY_ATTR_GRAN_2M)) {
config_prot_bits = V9_MPT_PROT_BITS;
config_access_shift = V9_MPT_ACCESS_SHIFT;
config_lut_prot = v9_mpt_prot_doubleword;
config_gran_mask = L1ENTRY_ATTR_GRAN(SMPT_GRAN_ATTR, V9_L1ENTRY_ATTR_GRAN_MASK);
this_version = cfg.version;
return &this_ops;
} else if ((cfg.version == S2MPU_VERSION_2) || (cfg.version == S2MPU_VERSION_1)) {
config_prot_bits = MPT_PROT_BITS;
config_access_shift = MPT_ACCESS_SHIFT;
config_lut_prot = mpt_prot_doubleword;
config_gran_mask = L1ENTRY_ATTR_GRAN(SMPT_GRAN_ATTR, L1ENTRY_ATTR_GRAN_MASK);
this_version = cfg.version;
return &this_ops;
}
return NULL;
}


@@ -0,0 +1,703 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2021 - Google LLC
* Author: David Brazdil <dbrazdil@google.com>
*/
#include <linux/kvm_host.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_s2mpu.h>
#include <linux/arm-smccc.h>
#include <nvhe/iommu.h>
#include <nvhe/memory.h>
#include <nvhe/mm.h>
#include <nvhe/spinlock.h>
#include <nvhe/trap_handler.h>
#include <asm/io-mpt-s2mpu.h>
#define SMC_CMD_PREPARE_PD_ONOFF 0x82000410
#define SMC_MODE_POWER_UP 1
#define PA_MAX ((phys_addr_t)SZ_1G * NR_GIGABYTES)
#define SYNC_MAX_RETRIES 5
#define SYNC_TIMEOUT 5
#define SYNC_TIMEOUT_MULTIPLIER 3
#define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \
(CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \
| (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0))
#define for_each_child(child, dev) \
list_for_each_entry((child), &(dev)->children, siblings)
/* HW version-specific operations. */
struct s2mpu_reg_ops {
int (*init)(struct pkvm_iommu *dev);
void (*set_control_regs)(struct pkvm_iommu *dev);
u32 (*host_mmio_reg_access_mask)(size_t off, bool is_write);
};
struct s2mpu_drv_data {
u32 version;
u32 context_cfg_valid_vid;
};
static const struct s2mpu_mpt_ops *mpt_ops;
static const struct s2mpu_reg_ops *reg_ops;
static struct mpt host_mpt;
const struct pkvm_iommu_ops pkvm_s2mpu_ops;
const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops;
static inline enum mpt_prot prot_to_mpt(enum kvm_pgtable_prot prot)
{
return ((prot & KVM_PGTABLE_PROT_R) ? MPT_PROT_R : 0) |
((prot & KVM_PGTABLE_PROT_W) ? MPT_PROT_W : 0);
}
static bool is_version(struct pkvm_iommu *dev, u32 version)
{
struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data;
return (data->version & VERSION_CHECK_MASK) == version;
}
static u32 __context_cfg_valid_vid(struct pkvm_iommu *dev, u32 vid_bmap)
{
struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data;
u8 ctx_vid[NR_CTX_IDS] = { 0 };
unsigned int vid, ctx = 0;
unsigned int num_ctx;
u32 res;
/* Only initialize once. */
if (data->context_cfg_valid_vid)
return data->context_cfg_valid_vid;
num_ctx = readl_relaxed(dev->va + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK;
while (vid_bmap) {
/* Break if we cannot allocate more. */
if (ctx >= num_ctx)
break;
vid = __ffs(vid_bmap);
vid_bmap &= ~BIT(vid);
ctx_vid[ctx++] = vid;
}
/* The following loop was unrolled so bitmasks are constant. */
BUILD_BUG_ON(NR_CTX_IDS != 8);
res = CTX_CFG_ENTRY(0, ctx, ctx_vid[0])
| CTX_CFG_ENTRY(1, ctx, ctx_vid[1])
| CTX_CFG_ENTRY(2, ctx, ctx_vid[2])
| CTX_CFG_ENTRY(3, ctx, ctx_vid[3])
| CTX_CFG_ENTRY(4, ctx, ctx_vid[4])
| CTX_CFG_ENTRY(5, ctx, ctx_vid[5])
| CTX_CFG_ENTRY(6, ctx, ctx_vid[6])
| CTX_CFG_ENTRY(7, ctx, ctx_vid[7]);
data->context_cfg_valid_vid = res;
return res;
}
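__context_cfg_valid_vid() walks the VID bitmap lowest-bit-first and hands out hardware contexts until they run out. A standalone sketch of that allocation loop, using __builtin_ctz in place of __ffs and an assumed context count:

#include <stdint.h>
#include <stdio.h>

#define NR_CTX 4u   /* illustrative number of hardware contexts */

/* Hand out context IDs to VIDs in bitmap order until contexts run out. */
static unsigned int assign_contexts(uint32_t vid_bmap, uint8_t ctx_vid[NR_CTX])
{
	unsigned int ctx = 0;

	while (vid_bmap && ctx < NR_CTX) {
		unsigned int vid = __builtin_ctz(vid_bmap);  /* lowest set bit, like __ffs() */

		vid_bmap &= ~(1u << vid);
		ctx_vid[ctx++] = vid;
	}
	return ctx;     /* number of contexts actually used */
}

int main(void)
{
	uint8_t ctx_vid[NR_CTX] = { 0 };
	unsigned int used = assign_contexts(0xb1, ctx_vid);   /* VIDs 0, 4, 5, 7 */
	unsigned int i;

	for (i = 0; i < used; i++)
		printf("ctx%u -> vid%u\n", i, ctx_vid[i]);
	return 0;
}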
static int __initialize_v2(struct pkvm_iommu *dev)
{
u32 ssmt_valid_vid_bmap, ctx_cfg;
/* Assume all VIDs may be generated by the connected SSMTs for now. */
ssmt_valid_vid_bmap = ALL_VIDS_BITMAP;
ctx_cfg = __context_cfg_valid_vid(dev, ssmt_valid_vid_bmap);
if (!ctx_cfg)
return -EINVAL;
/*
* Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY*
* registers. Writes to those registers are ignored unless there is
* a context ID allocated to the corresponding VID (v2 only).
*/
writel_relaxed(ctx_cfg, dev->va + REG_NS_CONTEXT_CFG_VALID_VID);
return 0;
}
static int __initialize(struct pkvm_iommu *dev)
{
struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data;
if (!data->version)
data->version = readl_relaxed(dev->va + REG_NS_VERSION);
switch (data->version & VERSION_CHECK_MASK) {
case S2MPU_VERSION_1:
return 0;
case S2MPU_VERSION_2:
return __initialize_v2(dev);
default:
return -EINVAL;
}
}
static void __set_control_regs(struct pkvm_iommu *dev)
{
u32 ctrl0 = 0, irq_vids;
/*
* Note: We set the values of CTRL0, CTRL1 and CFG registers here but we
* still rely on the correctness of their reset values. S2MPUs *must*
* reset to a state where all DMA traffic is blocked until the hypervisor
* writes its configuration to the S2MPU. A malicious EL1 could otherwise
* attempt to bypass the permission checks in the window between powering
* on the S2MPU and this function being called.
*/
/* Enable the S2MPU, otherwise all traffic would be allowed through. */
ctrl0 |= CTRL0_ENABLE;
/*
* Enable interrupts on fault for all VIDs. The IRQ must also be
* specified in DT to get unmasked in the GIC.
*/
ctrl0 |= CTRL0_INTERRUPT_ENABLE;
irq_vids = ALL_VIDS_BITMAP;
/* Return SLVERR/DECERR to device on permission fault. */
ctrl0 |= is_version(dev, S2MPU_VERSION_2) ? CTRL0_FAULT_RESP_TYPE_DECERR
: CTRL0_FAULT_RESP_TYPE_SLVERR;
writel_relaxed(irq_vids, dev->va + REG_NS_INTERRUPT_ENABLE_PER_VID_SET);
writel_relaxed(0, dev->va + REG_NS_CFG);
writel_relaxed(0, dev->va + REG_NS_CTRL1);
writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0);
}
static void __set_control_regs_v9(struct pkvm_iommu *dev)
{
/* Return DECERR to device on permission fault. */
writel_relaxed(ALL_VIDS_BITMAP,
dev->va + REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET);
/*
* Enable interrupts on fault for all VIDs. The IRQ must also be
* specified in DT to get unmasked in the GIC.
*/
writel_relaxed(ALL_VIDS_BITMAP,
dev->va + REG_NS_INTERRUPT_ENABLE_PER_VID_SET);
writel_relaxed(0, dev->va + REG_NS_CTRL0);
/* Enable the S2MPU, otherwise all traffic would be allowed through. */
writel_relaxed(ALL_VIDS_BITMAP,
dev->va + REG_NS_V9_CTRL_PROT_EN_PER_VID_SET);
writel_relaxed(0, dev->va + REG_NS_V9_CFG_MPTW_ATTRIBUTE);
}
/*
* Poll the given SFR until its value has all bits of a given mask set.
* Returns true if successful, false if not successful after a given number of
* attempts.
*/
static bool __wait_until(void __iomem *addr, u32 mask, size_t max_attempts)
{
size_t i;
for (i = 0; i < max_attempts; i++) {
if ((readl_relaxed(addr) & mask) == mask)
return true;
}
return false;
}
/* Poll the given SFR as long as its value has all bits of a given mask set. */
static void __wait_while(void __iomem *addr, u32 mask)
{
while ((readl_relaxed(addr) & mask) == mask)
continue;
}
static void __sync_cmd_start(struct pkvm_iommu *sync)
{
writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD);
}
static void __invalidation_barrier_slow(struct pkvm_iommu *sync)
{
size_t i, timeout;
/*
* Wait for transactions to drain if SysMMU_SYNCs were registered.
* Assumes that they are in the same power domain as the S2MPU.
*
* The algorithm will try initiating the SYNC if the SYNC_COMP_COMPLETE
* bit has not been set after a given number of attempts, increasing the
* timeout exponentially each time. If this cycle fails a given number
* of times, the algorithm will give up completely to avoid deadlock.
*/
timeout = SYNC_TIMEOUT;
for (i = 0; i < SYNC_MAX_RETRIES; i++) {
__sync_cmd_start(sync);
if (__wait_until(sync->va + REG_NS_SYNC_COMP, SYNC_COMP_COMPLETE, timeout))
break;
timeout *= SYNC_TIMEOUT_MULTIPLIER;
}
}
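The barrier polls the completion bit under a budget that grows geometrically on every retry (re-issuing the SYNC each round) and gives up after a fixed number of rounds rather than spinning forever. A minimal standalone sketch of that backoff shape, with the polled hardware bit faked by a counter:

#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 5
#define TIMEOUT     5
#define MULTIPLIER  3

/* Stand-in for the hardware completion bit: becomes true after N polls. */
static int polls_needed = 40;
static bool poll_complete(void)
{
	return --polls_needed <= 0;
}

static bool wait_with_backoff(void)
{
	unsigned int budget = TIMEOUT;
	int retry;

	for (retry = 0; retry < MAX_RETRIES; retry++) {
		unsigned int i;

		for (i = 0; i < budget; i++)
			if (poll_complete())
				return true;
		budget *= MULTIPLIER;        /* 5, 15, 45, 135, ... polls per round */
	}
	return false;                        /* give up instead of deadlocking */
}

int main(void)
{
	printf("%s\n", wait_with_backoff() ? "completed" : "gave up");
	return 0;
}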
/* Initiate invalidation barrier. */
static void __invalidation_barrier_init(struct pkvm_iommu *dev)
{
struct pkvm_iommu *sync;
for_each_child(sync, dev)
__sync_cmd_start(sync);
}
/* Wait for invalidation to complete. */
static void __invalidation_barrier_complete(struct pkvm_iommu *dev)
{
struct pkvm_iommu *sync;
/*
* Check if the SYNC_COMP_COMPLETE bit has been set for individual
* devices. If not, fall back to non-parallel invalidation.
*/
for_each_child(sync, dev) {
if (!(readl_relaxed(sync->va + REG_NS_SYNC_COMP) & SYNC_COMP_COMPLETE))
__invalidation_barrier_slow(sync);
}
/* Must not access SFRs while S2MPU is busy invalidating */
if (is_version(dev, S2MPU_VERSION_2) || is_version(dev, S2MPU_VERSION_9)) {
__wait_while(dev->va + REG_NS_STATUS,
STATUS_BUSY | STATUS_ON_INVALIDATING);
}
}
static void __all_invalidation(struct pkvm_iommu *dev)
{
writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_ALL_INVALIDATION);
__invalidation_barrier_init(dev);
__invalidation_barrier_complete(dev);
}
static void __range_invalidation_init(struct pkvm_iommu *dev, phys_addr_t first_byte,
phys_addr_t last_byte)
{
u32 start_ppn = first_byte >> RANGE_INVALIDATION_PPN_SHIFT;
u32 end_ppn = last_byte >> RANGE_INVALIDATION_PPN_SHIFT;
writel_relaxed(start_ppn, dev->va + REG_NS_RANGE_INVALIDATION_START_PPN);
writel_relaxed(end_ppn, dev->va + REG_NS_RANGE_INVALIDATION_END_PPN);
writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_RANGE_INVALIDATION);
__invalidation_barrier_init(dev);
}
/*
* Initialize S2MPU device and set all GB regions to 1G granularity with
* given protection bits.
*/
static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot)
{
int ret;
ret = reg_ops->init(dev);
if (ret)
return ret;
mpt_ops->init_with_prot(dev->va, prot);
__all_invalidation(dev);
/* Set control registers, enable the S2MPU. */
reg_ops->set_control_regs(dev);
return 0;
}
/*
* Initialize S2MPU device, set L2 table addresses and configure L1TABLE_ATTR
* registers according to the given MPT struct.
*/
static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt)
{
int ret;
ret = reg_ops->init(dev);
if (ret)
return ret;
mpt_ops->init_with_mpt(dev->va, mpt);
__all_invalidation(dev);
/* Set control registers, enable the S2MPU. */
reg_ops->set_control_regs(dev);
return 0;
}
static bool to_valid_range(phys_addr_t *start, phys_addr_t *end)
{
phys_addr_t new_start = *start;
phys_addr_t new_end = *end;
if (new_end > PA_MAX)
new_end = PA_MAX;
new_start = ALIGN_DOWN(new_start, SMPT_GRAN);
new_end = ALIGN(new_end, SMPT_GRAN);
if (new_start >= new_end)
return false;
*start = new_start;
*end = new_end;
return true;
}
static void __mpt_idmap_prepare(struct mpt *mpt, phys_addr_t first_byte,
phys_addr_t last_byte, enum mpt_prot prot)
{
mpt_ops->prepare_range(mpt, first_byte, last_byte, prot);
}
static void __mpt_idmap_apply(struct pkvm_iommu *dev, struct mpt *mpt,
phys_addr_t first_byte, phys_addr_t last_byte)
{
unsigned int first_gb = first_byte / SZ_1G;
unsigned int last_gb = last_byte / SZ_1G;
mpt_ops->apply_range(dev->va, mpt, first_gb, last_gb);
/* Initiate invalidation, completed in __mpt_idmap_complete. */
__range_invalidation_init(dev, first_byte, last_byte);
}
static void __mpt_idmap_complete(struct pkvm_iommu *dev, struct mpt *mpt)
{
__invalidation_barrier_complete(dev);
}
static void s2mpu_host_stage2_idmap_prepare(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot)
{
if (!to_valid_range(&start, &end))
return;
__mpt_idmap_prepare(&host_mpt, start, end - 1, prot_to_mpt(prot));
}
static void s2mpu_host_stage2_idmap_apply(struct pkvm_iommu *dev,
phys_addr_t start, phys_addr_t end)
{
if (!to_valid_range(&start, &end))
return;
__mpt_idmap_apply(dev, &host_mpt, start, end - 1);
}
static void s2mpu_host_stage2_idmap_complete(struct pkvm_iommu *dev)
{
__mpt_idmap_complete(dev, &host_mpt);
}
static int s2mpu_resume(struct pkvm_iommu *dev)
{
/*
* Initialize the S2MPU with the host stage-2 MPT. It is paramount
* that the S2MPU reset state is enabled and blocking all traffic,
* otherwise the host would not be forced to call the resume HVC
* before issuing DMA traffic.
*/
return initialize_with_mpt(dev, &host_mpt);
}
static int s2mpu_suspend(struct pkvm_iommu *dev)
{
/*
* Stop updating the S2MPU when the host informs us about the intention
* to suspend it. Writes to powered-down MMIO registers would trigger
* SErrors in EL1 otherwise. However, hyp must put S2MPU back to
* blocking state first, in case the host does not actually power it
* down and continues issuing DMA traffic.
*/
return initialize_with_prot(dev, MPT_PROT_NONE);
}
static u32 host_mmio_reg_access_mask_v9(size_t off, bool is_write)
{
const u32 no_access = 0;
const u32 read_write = (u32)(-1);
const u32 read_only = is_write ? no_access : read_write;
const u32 write_only = is_write ? read_write : no_access;
switch (off) {
/* Allow reading control registers for debugging. */
case REG_NS_CTRL0:
return read_only & V9_CTRL0_MASK;
case REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET:
return read_only & ALL_VIDS_BITMAP;
case REG_NS_V9_CTRL_PROT_EN_PER_VID_SET:
return read_only & ALL_VIDS_BITMAP;
case REG_NS_V9_READ_STLB:
return write_only & (V9_READ_STLB_MASK_TYPEA | V9_READ_STLB_MASK_TYPEB);
case REG_NS_V9_READ_STLB_TPN:
return read_only & V9_READ_STLB_TPN_MASK;
case REG_NS_V9_READ_STLB_TAG_PPN:
return read_only & V9_READ_STLB_TAG_PPN_MASK;
case REG_NS_V9_READ_STLB_TAG_OTHERS:
return read_only & V9_READ_STLB_TAG_OTHERS_MASK;
case REG_NS_V9_READ_STLB_DATA:
return read_only;
case REG_NS_V9_MPTC_INFO:
return read_only & V9_READ_MPTC_INFO_MASK;
case REG_NS_V9_READ_MPTC:
return write_only & V9_READ_MPTC_MASK;
case REG_NS_V9_READ_MPTC_TAG_PPN:
return read_only & V9_READ_MPTC_TAG_PPN_MASK;
case REG_NS_V9_READ_MPTC_TAG_OTHERS:
return read_only & V9_READ_MPTC_TAG_OTHERS_MASK;
case REG_NS_V9_READ_MPTC_DATA:
return read_only;
case REG_NS_V9_PMMU_INFO:
return read_only & V9_READ_PMMU_INFO_MASK;
case REG_NS_V9_READ_PTLB:
return write_only & V9_READ_PTLB_MASK;
case REG_NS_V9_READ_PTLB_TAG:
return read_only & V9_READ_PTLB_TAG_MASK;
case REG_NS_V9_READ_PTLB_DATA_S1_EN_PPN_AP:
return read_only & V9_READ_PTLB_DATA_S1_ENABLE_PPN_AP_MASK;
case REG_NS_V9_READ_PTLB_DATA_S1_DIS_AP_LIST:
return read_only;
case REG_NS_V9_PMMU_INDICATOR:
return read_only & V9_READ_PMMU_INDICATOR_MASK;
case REG_NS_V9_SWALKER_INFO:
return read_only & V9_SWALKER_INFO_MASK;
};
if (off >= REG_NS_V9_PMMU_PTLB_INFO(0) && off < REG_NS_V9_PMMU_PTLB_INFO(V9_MAX_PTLB_NUM))
return read_only & V9_READ_PMMU_PTLB_INFO_MASK;
if (off >= REG_NS_V9_STLB_INFO(0) && off < REG_NS_V9_STLB_INFO(V9_MAX_STLB_NUM))
return read_only & V9_READ_SLTB_INFO_MASK;
return no_access;
}
static u32 host_mmio_reg_access_mask_v1_v2(size_t off, bool is_write)
{
const u32 no_access = 0;
const u32 read_write = (u32)(-1);
const u32 read_only = is_write ? no_access : read_write;
const u32 write_only = is_write ? read_write : no_access;
switch (off) {
/* Allow reading control registers for debugging. */
case REG_NS_CTRL0:
return read_only & CTRL0_MASK;
case REG_NS_CTRL1:
return read_only & CTRL1_MASK;
/* Allow reading MPTC entries for debugging. That involves:
* - writing (set,way) to READ_MPTC
* - reading READ_MPTC_*
*/
case REG_NS_READ_MPTC:
return write_only & READ_MPTC_MASK;
case REG_NS_READ_MPTC_TAG_PPN:
return read_only & READ_MPTC_TAG_PPN_MASK;
case REG_NS_READ_MPTC_TAG_OTHERS:
return read_only & READ_MPTC_TAG_OTHERS_MASK;
case REG_NS_READ_MPTC_DATA:
return read_only;
};
return no_access;
}
static u32 host_mmio_reg_access_mask(size_t off, bool is_write)
{
const u32 no_access = 0;
const u32 read_write = (u32)(-1);
const u32 read_only = is_write ? no_access : read_write;
const u32 write_only = is_write ? read_write : no_access;
u32 masked_off;
switch (off) {
case REG_NS_CFG:
return read_only & CFG_MASK;
/* Allow EL1 IRQ handler to clear interrupts. */
case REG_NS_INTERRUPT_CLEAR:
return write_only & ALL_VIDS_BITMAP;
/* Allow reading number of sets used by MPTC. */
case REG_NS_INFO:
return read_only & INFO_NUM_SET_MASK;
/* Allow EL1 IRQ handler to read bitmap of pending interrupts. */
case REG_NS_FAULT_STATUS:
return read_only & ALL_VIDS_BITMAP;
}
/* Allow reading L1ENTRY registers for debugging. */
if (off >= REG_NS_L1ENTRY_L2TABLE_ADDR(0, 0) &&
off < REG_NS_L1ENTRY_ATTR(NR_VIDS, 0))
return read_only;
/* Allow EL1 IRQ handler to read fault information. */
masked_off = off & ~REG_NS_FAULT_VID_MASK;
if ((masked_off == REG_NS_FAULT_PA_LOW(0)) ||
(masked_off == REG_NS_FAULT_PA_HIGH(0)) ||
(masked_off == REG_NS_FAULT_INFO(0)))
return read_only;
/* Check version-specific registers. */
return reg_ops->host_mmio_reg_access_mask(off, is_write);
}
static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev,
struct kvm_cpu_context *host_ctxt,
u32 esr, size_t off)
{
bool is_write = esr & ESR_ELx_WNR;
unsigned int len = BIT((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT);
int rd = (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
u32 mask;
/* Only handle MMIO access with u32 size and alignment. */
if ((len != sizeof(u32)) || (off & (sizeof(u32) - 1)))
return false;
mask = host_mmio_reg_access_mask(off, is_write);
if (!mask)
return false;
if (is_write)
writel_relaxed(cpu_reg(host_ctxt, rd) & mask, dev->va + off);
else
cpu_reg(host_ctxt, rd) = readl_relaxed(dev->va + off) & mask;
return true;
}
/*
* Operations that differ between versions. We need to maintain
* the old behaviour where v1 and v2 can be used together.
*/
const struct s2mpu_reg_ops ops_v1_v2 = {
.init = __initialize,
.host_mmio_reg_access_mask = host_mmio_reg_access_mask_v1_v2,
.set_control_regs = __set_control_regs,
};
const struct s2mpu_reg_ops ops_v9 = {
.init = __initialize_v2,
.host_mmio_reg_access_mask = host_mmio_reg_access_mask_v9,
.set_control_regs = __set_control_regs_v9,
};
static int s2mpu_init(void *data, size_t size)
{
struct mpt in_mpt;
u32 *smpt;
phys_addr_t pa;
unsigned int gb;
int ret = 0;
int smpt_nr_pages, smpt_size;
struct s2mpu_mpt_cfg cfg;
if (size != sizeof(in_mpt))
return -EINVAL;
/* The host can concurrently modify 'data'. Copy it to avoid TOCTOU. */
memcpy(&in_mpt, data, sizeof(in_mpt));
cfg.version = in_mpt.version;
/* Make sure the version sent is supported by the driver. */
if ((cfg.version == S2MPU_VERSION_1) || (cfg.version == S2MPU_VERSION_2))
reg_ops = &ops_v1_v2;
else if (cfg.version == S2MPU_VERSION_9)
reg_ops = &ops_v9;
else
return -ENODEV;
/* Get page table operations for this version. */
mpt_ops = s2mpu_get_mpt_ops(cfg);
/* If the version is not supported, return. */
if (!mpt_ops)
return -EINVAL;
smpt_size = mpt_ops->smpt_size();
smpt_nr_pages = smpt_size / PAGE_SIZE;
/* Take ownership of all SMPT buffers. This will also map them in. */
for_each_gb(gb) {
smpt = kern_hyp_va(in_mpt.fmpt[gb].smpt);
pa = __hyp_pa(smpt);
if (!IS_ALIGNED(pa, smpt_size)) {
ret = -EINVAL;
break;
}
ret = __pkvm_host_donate_hyp(pa >> PAGE_SHIFT, smpt_nr_pages);
if (ret)
break;
host_mpt.fmpt[gb] = (struct fmpt){
.smpt = smpt,
.gran_1g = true,
.prot = MPT_PROT_RW,
};
}
/* Try to return memory back if there was an error. */
if (ret) {
for_each_gb(gb) {
smpt = host_mpt.fmpt[gb].smpt;
if (!smpt)
break;
WARN_ON(__pkvm_hyp_donate_host(__hyp_pa(smpt) >> PAGE_SHIFT,
smpt_nr_pages));
}
memset(&host_mpt, 0, sizeof(host_mpt));
}
return ret;
}
static int s2mpu_validate(struct pkvm_iommu *dev)
{
if (dev->size != S2MPU_MMIO_SIZE)
return -EINVAL;
return 0;
}
static int s2mpu_validate_child(struct pkvm_iommu *dev, struct pkvm_iommu *child)
{
if (child->ops != &pkvm_sysmmu_sync_ops)
return -EINVAL;
return 0;
}
static int sysmmu_sync_validate(struct pkvm_iommu *dev)
{
if (dev->size != SYSMMU_SYNC_S2_MMIO_SIZE)
return -EINVAL;
if (!dev->parent || dev->parent->ops != &pkvm_s2mpu_ops)
return -EINVAL;
return 0;
}
const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){
.init = s2mpu_init,
.validate = s2mpu_validate,
.validate_child = s2mpu_validate_child,
.resume = s2mpu_resume,
.suspend = s2mpu_suspend,
.host_stage2_idmap_prepare = s2mpu_host_stage2_idmap_prepare,
.host_stage2_idmap_apply = s2mpu_host_stage2_idmap_apply,
.host_stage2_idmap_complete = s2mpu_host_stage2_idmap_complete,
.host_dabt_handler = s2mpu_host_dabt_handler,
.data_size = sizeof(struct s2mpu_drv_data),
};
const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops = (struct pkvm_iommu_ops){
.validate = sysmmu_sync_validate,
};
struct pkvm_iommu_driver pkvm_s2mpu_driver = (struct pkvm_iommu_driver){
.ops = &pkvm_s2mpu_ops,
};
struct pkvm_iommu_driver pkvm_sysmmu_sync_driver = (struct pkvm_iommu_driver){
.ops = &pkvm_sysmmu_sync_ops,
};

File diff suppressed because it is too large


@@ -14,7 +14,9 @@
#include <nvhe/early_alloc.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/modules.h>
#include <nvhe/spinlock.h>
struct kvm_pgtable pkvm_pgtable;
@@ -23,7 +25,14 @@ hyp_spinlock_t pkvm_pgd_lock;
struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
unsigned int hyp_memblock_nr;
static u64 __io_map_base;
static u64 __private_range_base;
static u64 __private_range_cur;
struct hyp_fixmap_slot {
u64 addr;
kvm_pte_t *ptep;
};
static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
static int __pkvm_create_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
@@ -42,29 +51,29 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
* @size: The size of the VA range to reserve.
* @haddr: The hypervisor virtual start address of the allocation.
*
* The private virtual address (VA) range is allocated above __io_map_base
* The private virtual address (VA) range is allocated above __private_range_base
* and aligned based on the order of @size.
*
* Return: 0 on success or negative error code on failure.
*/
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr)
{
unsigned long base, addr;
unsigned long cur, addr;
int ret = 0;
hyp_spin_lock(&pkvm_pgd_lock);
/* Align the allocation based on the order of its size */
addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size));
addr = ALIGN(__private_range_cur, PAGE_SIZE << get_order(size));
/* The allocated size is always a multiple of PAGE_SIZE */
base = addr + PAGE_ALIGN(size);
cur = addr + PAGE_ALIGN(size);
/* Are we overflowing on the vmemmap ? */
if (!addr || base > __hyp_vmemmap)
/* Has the private range grown too large ? */
if (!addr || cur > __hyp_vmemmap || (cur - __private_range_base) > __PKVM_PRIVATE_SZ) {
ret = -ENOMEM;
else {
__io_map_base = base;
} else {
__private_range_cur = cur;
*haddr = addr;
}
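Private VA allocations are aligned to the power-of-two order of their size, so a 3-page request is placed on a 4-page boundary and the cursor only ever moves forward. A standalone sketch of that rule, assuming a 4 KiB page:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12u
#define PAGE_SZ    (1u << PAGE_SHIFT)

/* Smallest n such that (PAGE_SZ << n) >= size; mirrors get_order() for size > 0. */
static unsigned int order_of(uint64_t size)
{
	unsigned int order = 0;

	while ((PAGE_SZ << order) < size)
		order++;
	return order;
}

static uint64_t align_up(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	uint64_t cur = 0x100000 + PAGE_SZ;           /* pretend allocator cursor */
	uint64_t size = 3 * PAGE_SZ;                 /* 3-page request */
	uint64_t align = PAGE_SZ << order_of(size);  /* rounded up to 4 pages */
	uint64_t addr = align_up(cur, align);

	printf("cursor %#llx, request %#llx -> addr %#llx, new cursor %#llx\n",
	       (unsigned long long)cur, (unsigned long long)size,
	       (unsigned long long)addr,
	       (unsigned long long)(addr + align_up(size, PAGE_SZ)));
	return 0;
}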
@@ -93,6 +102,48 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
return err;
}
void *__pkvm_alloc_module_va(u64 nr_pages)
{
unsigned long addr = 0;
pkvm_modules_lock();
if (pkvm_modules_enabled())
pkvm_alloc_private_va_range(nr_pages << PAGE_SHIFT, &addr);
pkvm_modules_unlock();
return (void *)addr;
}
int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot)
{
int ret = -EACCES;
pkvm_modules_lock();
if (!pkvm_modules_enabled())
goto err;
ret = __pkvm_host_donate_hyp(pfn, 1);
if (ret)
goto err;
ret = __pkvm_create_mappings((unsigned long)va, PAGE_SIZE, hyp_pfn_to_phys(pfn), prot);
err:
pkvm_modules_unlock();
return ret;
}
void __pkvm_unmap_module_page(u64 pfn, void *va)
{
pkvm_modules_lock();
if (pkvm_modules_enabled()) {
WARN_ON(__pkvm_hyp_donate_host(pfn, 1));
pkvm_remove_mappings(va, va + PAGE_SIZE);
}
pkvm_modules_unlock();
}
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
{
unsigned long start = (unsigned long)from;
@@ -129,13 +180,45 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return ret;
}
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
void pkvm_remove_mappings(void *from, void *to)
{
unsigned long start, end;
unsigned long size = (unsigned long)to - (unsigned long)from;
hyp_vmemmap_range(phys, size, &start, &end);
hyp_spin_lock(&pkvm_pgd_lock);
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, (u64)from, size) != size);
hyp_spin_unlock(&pkvm_pgd_lock);
}
return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
int hyp_back_vmemmap(phys_addr_t back)
{
unsigned long i, start, size, end = 0;
int ret;
for (i = 0; i < hyp_memblock_nr; i++) {
start = hyp_memory[i].base;
start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
/*
* The beginning of the hyp_vmemmap region for the current
* memblock may already be backed by the page backing the end
* of the previous region, so avoid mapping it twice.
*/
start = max(start, end);
end = hyp_memory[i].base + hyp_memory[i].size;
end = PAGE_ALIGN((u64)hyp_phys_to_page(end));
if (start >= end)
continue;
size = end - start;
ret = __pkvm_create_mappings(start, size, back, PAGE_HYP);
if (ret)
return ret;
memset(hyp_phys_to_virt(back), 0, size);
back += size;
}
return 0;
}
static void *__hyp_bp_vect_base;
@@ -189,6 +272,103 @@ int hyp_map_vectors(void)
return 0;
}
void *hyp_fixmap_map(phys_addr_t phys)
{
struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
kvm_pte_t pte, *ptep = slot->ptep;
pte = *ptep;
pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
WRITE_ONCE(*ptep, pte);
dsb(ishst);
return (void *)slot->addr + offset_in_page(phys);
}
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
kvm_pte_t *ptep = slot->ptep;
u64 addr = slot->addr;
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
/*
* Irritatingly, the architecture requires that we use inner-shareable
* broadcast TLB invalidation here in case another CPU speculates
* through our fixmap and decides to create an "amalgamation of the
* values held in the TLB" due to the apparent lack of a
* break-before-make sequence.
*
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
dsb(ish);
isb();
}
void hyp_fixmap_unmap(void)
{
fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
}
static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg);
if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1)
return -EINVAL;
slot->addr = addr;
slot->ptep = ptep;
/*
* Clear the PTE, but keep the page-table page refcount elevated to
* prevent it from ever being freed. This lets us manipulate the PTEs
* by hand safely without ever needing to allocate memory.
*/
fixmap_clear_slot(slot);
return 0;
}
static int create_fixmap_slot(u64 addr, u64 cpu)
{
struct kvm_pgtable_walker walker = {
.cb = __create_fixmap_slot_cb,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = (void *)cpu,
};
return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
}
int hyp_create_pcpu_fixmap(void)
{
unsigned long addr, i;
int ret;
for (i = 0; i < hyp_nr_cpus; i++) {
ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
if (ret)
return ret;
ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
__hyp_pa(__hyp_bss_start), PAGE_HYP);
if (ret)
return ret;
ret = create_fixmap_slot(addr, i);
if (ret)
return ret;
}
return 0;
}
int hyp_create_idmap(u32 hyp_va_bits)
{
unsigned long start, end;
@@ -207,9 +387,43 @@ int hyp_create_idmap(u32 hyp_va_bits)
* with the idmap to place the IOs and the vmemmap. IOs use the lower
* half of the quarter and the vmemmap the upper half.
*/
-__io_map_base = start & BIT(hyp_va_bits - 2);
-__io_map_base ^= BIT(hyp_va_bits - 2);
-__hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+__private_range_base = start & BIT(hyp_va_bits - 2);
+__private_range_base ^= BIT(hyp_va_bits - 2);
+__private_range_cur = __private_range_base;
+__hyp_vmemmap = __private_range_base | BIT(hyp_va_bits - 3);
return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
}
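
The carve-out is pure bit arithmetic: flip bit (hyp_va_bits - 2) of the idmap address to land in the quarter of the VA space that does not contain the idmap, then set bit (hyp_va_bits - 3) to put the vmemmap in the upper half of that quarter. A standalone check with hyp_va_bits = 48 and a made-up idmap address, for illustration only:

/* Standalone check of the carve-out arithmetic; the idmap address is made up. */
#include <stdio.h>

#define BIT(n) (1UL << (n))

int main(void)
{
	unsigned long hyp_va_bits = 48;
	unsigned long idmap_start = 0x0000004080000000UL;	/* hypothetical */
	unsigned long private_base, vmemmap;

	private_base  = idmap_start & BIT(hyp_va_bits - 2);	/* quarter holding the idmap... */
	private_base ^= BIT(hyp_va_bits - 2);			/* ...flipped to the other quarter */
	vmemmap = private_base | BIT(hyp_va_bits - 3);		/* upper half of that quarter */

	printf("idmap page:         %#018lx\n", idmap_start);
	printf("private range base: %#018lx\n", private_base);	/* 0x0000400000000000 */
	printf("hyp vmemmap base:   %#018lx\n", vmemmap);	/* 0x0000600000000000 */
	return 0;
}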
static void *admit_host_page(void *arg)
{
struct kvm_hyp_memcache *host_mc = arg;
if (!host_mc->nr_pages)
return NULL;
/*
* The host still owns the pages in its memcache, so we need to go
* through a full host-to-hyp donation cycle to change it. Fortunately,
* __pkvm_host_donate_hyp() takes care of races for us, so if it
* succeeds we're good to go.
*/
if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1))
return NULL;
return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
}
/* Refill our local memcache by popping pages from the one provided by the host. */
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
struct kvm_hyp_memcache *host_mc)
{
struct kvm_hyp_memcache tmp = *host_mc;
int ret;
ret = __topup_hyp_memcache(mc, min_pages, admit_host_page,
hyp_virt_to_phys, &tmp);
*host_mc = tmp;
return ret;
}
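
A kvm_hyp_memcache is just a head address plus a page count; the cached pages form a singly linked stack, each free page storing the address of the next one in its first word, and admit_host_page() adds a host-to-hyp donation step on top of the plain pop. A toy, user-space model of that push/pop scheme (illustration only, no phys/virt conversion):

/* Toy model of a hyp memcache: a stack of free pages linked through themselves. */
#include <stdio.h>
#include <stdlib.h>

struct toy_memcache {
	void *head;		/* most recently pushed page, NULL when empty */
	unsigned long nr_pages;
};

static void toy_push(struct toy_memcache *mc, void *page)
{
	*(void **)page = mc->head;	/* stash the old head in the page itself */
	mc->head = page;
	mc->nr_pages++;
}

static void *toy_pop(struct toy_memcache *mc)
{
	void *page = mc->head;

	if (!page)
		return NULL;
	mc->head = *(void **)page;	/* the next page was stored in this one */
	mc->nr_pages--;
	return page;
}

int main(void)
{
	struct toy_memcache mc = { NULL, 0 };

	for (int i = 0; i < 4; i++)
		toy_push(&mc, malloc(4096));
	printf("cached %lu pages\n", mc.nr_pages);
	while (mc.nr_pages)
		free(toy_pop(&mc));
	return 0;
}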

@@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/hyp_image.h>
#include <asm/page-def.h>
SECTIONS {
.hyp.text : {
HYP_SECTION_SYMBOL_NAME(.text) = .;
*(.text .text.*)
}
.hyp.bss : {
HYP_SECTION_SYMBOL_NAME(.bss) = .;
*(.bss .bss.*)
}
.hyp.rodata : {
HYP_SECTION_SYMBOL_NAME(.rodata) = .;
*(.rodata .rodata.*)
}
.hyp.data : {
HYP_SECTION_SYMBOL_NAME(.data) = .;
*(.data .data.*)
}
}

@@ -0,0 +1,208 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2022 Google LLC
*/
#include <asm/kvm_host.h>
#include <asm/kvm_pkvm_module.h>
#include <nvhe/mem_protect.h>
#include <nvhe/modules.h>
#include <nvhe/mm.h>
#include <nvhe/serial.h>
#include <nvhe/spinlock.h>
#include <nvhe/trap_handler.h>
static void __kvm_flush_dcache_to_poc(void *addr, size_t size)
{
kvm_flush_dcache_to_poc((unsigned long)addr, (unsigned long)size);
}
DEFINE_HYP_SPINLOCK(modules_lock);
bool __pkvm_modules_enabled __ro_after_init;
void pkvm_modules_lock(void)
{
hyp_spin_lock(&modules_lock);
}
void pkvm_modules_unlock(void)
{
hyp_spin_unlock(&modules_lock);
}
bool pkvm_modules_enabled(void)
{
return __pkvm_modules_enabled;
}
static u64 early_lm_pages;
static void *__pkvm_linear_map_early(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot)
{
void *addr = NULL;
int ret;
if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
return NULL;
pkvm_modules_lock();
if (!__pkvm_modules_enabled)
goto out;
addr = __hyp_va(phys);
ret = pkvm_create_mappings(addr, addr + size, prot);
if (ret)
addr = NULL;
else
early_lm_pages += size >> PAGE_SHIFT;
out:
pkvm_modules_unlock();
return addr;
}
static void __pkvm_linear_unmap_early(void *addr, size_t size)
{
pkvm_modules_lock();
pkvm_remove_mappings(addr, addr + size);
early_lm_pages -= size >> PAGE_SHIFT;
pkvm_modules_unlock();
}
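
These early linear-map hooks must be used in balanced pairs: __pkvm_close_module_registration() below refuses to believe the linear map is clean while early_lm_pages is non-zero. A hedged sketch of a helper a module's early init might call through the ops table; the carveout address, magic value and names are hypothetical:

/*
 * Hypothetical helper for a module's early init: peek at a firmware-described
 * carveout, then unmap it again so early_lm_pages is balanced before module
 * registration is closed. Sketch only.
 */
static int my_mod_probe_carveout(const struct pkvm_module_ops *ops)
{
	const phys_addr_t carveout_pa = 0x880000000UL;	/* hypothetical */
	const size_t carveout_sz = 2 * PAGE_SIZE;	/* must be page-aligned */
	u32 magic;
	void *va;

	va = ops->linear_map_early(carveout_pa, carveout_sz, PAGE_HYP);
	if (!va)
		return -EINVAL;

	magic = *(u32 *)va;
	ops->linear_unmap_early(va, carveout_sz);

	return magic == 0x4d4f4455 ? 0 : -ENODEV;	/* made-up magic */
}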
int __pkvm_close_module_registration(void)
{
int ret;
pkvm_modules_lock();
/*
* Page ownership tracking might go out of sync if there are stale
* entries in pKVM's linear map range, so they must really be gone by
* now.
*/
WARN_ON(early_lm_pages);
ret = __pkvm_modules_enabled ? 0 : -EACCES;
if (!ret) {
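/*
 * __pkvm_modules_enabled is marked __ro_after_init, so clear it through a
 * temporary writable alias in this CPU's fixmap slot rather than through
 * its normal mapping.
 */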
void *addr = hyp_fixmap_map(__hyp_pa(&__pkvm_modules_enabled));
*(bool *)addr = false;
hyp_fixmap_unmap();
}
pkvm_modules_unlock();
/* The fuse is blown! No way back until reset */
return ret;
}
const struct pkvm_module_ops module_ops = {
.create_private_mapping = __pkvm_create_private_mapping,
.register_serial_driver = __pkvm_register_serial_driver,
.puts = hyp_puts,
.putx64 = hyp_putx64,
.fixmap_map = hyp_fixmap_map,
.fixmap_unmap = hyp_fixmap_unmap,
.linear_map_early = __pkvm_linear_map_early,
.linear_unmap_early = __pkvm_linear_unmap_early,
.flush_dcache_to_poc = __kvm_flush_dcache_to_poc,
.register_host_perm_fault_handler = hyp_register_host_perm_fault_handler,
.protect_host_page = hyp_protect_host_page,
.register_host_smc_handler = __pkvm_register_host_smc_handler,
.register_default_trap_handler = __pkvm_register_default_trap_handler,
.register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier,
.register_psci_notifier = __pkvm_register_psci_notifier,
.register_hyp_panic_notifier = __pkvm_register_hyp_panic_notifier,
};
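
__pkvm_init_module() below simply hands this table to the module's EL2 entry point. A hedged sketch of what such an entry point could look like, using only the print helpers from the table above; the module name and behaviour are hypothetical, and a real module would go on to register its trap/SMC handlers and notifiers from here:

/*
 * Hypothetical EL2 init routine for a pKVM module; its address is what the
 * host-side loader would pass to __pkvm_init_module(). Sketch only.
 */
static int my_mod_hyp_init(const struct pkvm_module_ops *ops)
{
	ops->puts("my_mod: initialising at EL2");
	ops->putx64(0x1);	/* e.g. advertise a module version */

	/* Register handlers/notifiers via @ops here in a real module. */
	return 0;
}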
int __pkvm_init_module(void *module_init)
{
int (*do_module_init)(const struct pkvm_module_ops *ops) = module_init;
int ret;
pkvm_modules_lock();
if (!pkvm_modules_enabled()) {
ret = -EACCES;
goto err;
}
ret = do_module_init(&module_ops);
err:
pkvm_modules_unlock();
return ret;
}
#define MAX_DYNAMIC_HCALLS 128
atomic_t num_dynamic_hcalls = ATOMIC_INIT(0);
DEFINE_HYP_SPINLOCK(dyn_hcall_lock);
static dyn_hcall_t host_dynamic_hcalls[MAX_DYNAMIC_HCALLS];
int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, id, host_ctxt, 0);
dyn_hcall_t hfn;
int dyn_id;
/*
* TODO: static key to protect when no dynamic hcall is registered?
*/
dyn_id = (int)(id - KVM_HOST_SMCCC_ID(0)) -
__KVM_HOST_SMCCC_FUNC___dynamic_hcalls;
if (dyn_id < 0)
return HCALL_UNHANDLED;
cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
/*
* Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired
* with __pkvm_register_hcall().
*/
if (dyn_id >= atomic_read_acquire(&num_dynamic_hcalls))
goto end;
hfn = READ_ONCE(host_dynamic_hcalls[dyn_id]);
if (!hfn)
goto end;
cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
hfn(host_ctxt);
end:
return HCALL_HANDLED;
}
int __pkvm_register_hcall(unsigned long hvn_hyp_va)
{
dyn_hcall_t hfn = (void *)hvn_hyp_va;
int reserved_id, ret;
pkvm_modules_lock();
if (!pkvm_modules_enabled()) {
ret = -EACCES;
goto err;
}
hyp_spin_lock(&dyn_hcall_lock);
reserved_id = atomic_read(&num_dynamic_hcalls);
if (reserved_id >= MAX_DYNAMIC_HCALLS) {
ret = -ENOMEM;
goto err_hcall_unlock;
}
WRITE_ONCE(host_dynamic_hcalls[reserved_id], hfn);
/*
* Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired
* with handle_host_dynamic_hcall.
*/
atomic_set_release(&num_dynamic_hcalls, reserved_id + 1);
ret = reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls;
err_hcall_unlock:
hyp_spin_unlock(&dyn_hcall_lock);
err:
pkvm_modules_unlock();
return ret;
};
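
From the module's point of view, the value returned here (reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls) is the SMCCC function number the host must use for handle_host_dynamic_hcall() to dispatch into the registered hfn, which runs with the host's kvm_cpu_context. A hedged sketch of such a handler; the module function is hypothetical:

/*
 * Hypothetical handler a module could pass to __pkvm_register_hcall().
 * handle_host_dynamic_hcall() has already set x0 to SMCCC_RET_SUCCESS by the
 * time this runs; results go back to the host in the other registers.
 */
static void my_mod_hcall(struct kvm_cpu_context *host_ctxt)
{
	DECLARE_REG(u64, arg, host_ctxt, 1);	/* x1 as passed by the host */

	cpu_reg(host_ctxt, 1) = arg + 1;	/* hand a result back in x1 */
}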
