android_kernel_xiaomi_sm8450/mm/page_alloc.c
Greg Kroah-Hartman 18cd39b706 Merge tag 'android12-5.10.136_r00' into android12-5.10
This is the merge of the upstream LTS release of 5.10.136 into the
android12-5.10 branch.

It contains the following commits:

ee965fe12d Merge branch 'android12-5.10' into branch 'android12-5.10-lts'
b7247246f6 Merge 5.10.136 into android12-5.10-lts
6eae1503dd Linux 5.10.136
1bea03b44e x86/speculation: Add LFENCE to RSB fill sequence
509c2c9fe7 x86/speculation: Add RSB VM Exit protections
e5b556a7b2 macintosh/adb: fix oob read in do_adb_query() function
75742ffc36 Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3586
40e2e7f1bf Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3587
9c45bb363e Bluetooth: btusb: Add Realtek RTL8852C support ID 0x0CB8:0xC558
3a292cb181 Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04C5:0x1675
1a2a2e3456 Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04CA:0x4007
e81f95d030 Bluetooth: btusb: Add support of IMC Networks PID 0x3568
918ce738e2 Bluetooth: hci_bcm: Add DT compatible for CYW55572
033a4455d9 Bluetooth: hci_bcm: Add BCM4349B1 variant
50763f0ac0 selftests: KVM: Handle compiler optimizations in ucall
a56e1ccdb7 tools/kvm_stat: fix display of error when multiple processes are found
3c77292d52 crypto: arm64/poly1305 - fix a read out-of-bound
e2c63e1afd ACPI: APEI: Better fix to avoid spamming the console with old error logs
6ccff35588 ACPI: video: Shortening quirk list by identifying Clevo by board_name only
a2b472b152 ACPI: video: Force backlight native for some TongFang devices
a01a4e9f5d tun: avoid double free in tun_free_netdev
1069087e2f selftests/bpf: Check dst_port only on the client socket
042fb1c281 selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads
78c8397132 ath9k_htc: fix NULL pointer dereference at ath9k_htc_tx_get_packet()
4f3b852336 ath9k_htc: fix NULL pointer dereference at ath9k_htc_rxep()
45b69848a2 x86/speculation: Make all RETbleed mitigations 64-bit only
30abcdabf2 Merge 5.10.135 into android12-5.10-lts
f6ce9a9115 Merge 5.10.134 into android12-5.10-lts
4fd9cb57a3 Linux 5.10.135
4bfc9dc608 selftests: bpf: Don't run sk_lookup in verifier tests
6d3fad2b44 bpf: Add PROG_TEST_RUN support for sk_lookup programs
6aad811b37 bpf: Consolidate shared test timing code
545fc3524c x86/bugs: Do not enable IBPB at firmware entry when IBPB is not available
14b494b7aa xfs: Enforce attr3 buffer recovery order
e5f9d4e0f8 xfs: logging the on disk inode LSN can make it go backwards
c1268acaa0 xfs: remove dead stale buf unpin handling code
c85cbb0b21 xfs: hold buffer across unpin and potential shutdown processing
d8f5bb0a09 xfs: force the log offline when log intent item recovery fails
eccacbcbfd xfs: fix log intent recovery ENOSPC shutdowns when inactivating inodes
17c8097fb0 xfs: prevent UAF in xfs_log_item_in_current_chkpt
6d3605f84e xfs: xfs_log_force_lsn isn't passed a LSN
41fbfdaba9 xfs: refactor xfs_file_fsync
aadc39fd5b docs/kernel-parameters: Update descriptions for "mitigations=" param with retbleed
c4cd52ab1e EDAC/ghes: Set the DIMM label unconditionally
c454639172 ARM: 9216/1: Fix MAX_DMA_ADDRESS overflow
e500aa9f2d mt7601u: add USB device ID for some versions of XiaoDu WiFi Dongle.
2670f76a56 page_alloc: fix invalid watermark check on a negative value
8014246694 ARM: crypto: comment out gcc warning that breaks clang builds
6f3505588d sctp: leave the err path free in sctp_stream_init to sctp_stream_free
510e5b3791 sfc: disable softirqs for ptp TX
3ec42508a6 perf symbol: Correct address for bss symbols
6807897695 virtio-net: fix the race between refill work and close
440dccd80f netfilter: nf_queue: do not allow packet truncation below transport header offset
aeb2ff9f9f sctp: fix sleep in atomic context bug in timer handlers
fad6caf9b1 i40e: Fix interface init with MSI interrupts (no MSI-X)
e4a7acd6b4 tcp: Fix data-races around sysctl_tcp_reflect_tos.
f310fb69a0 tcp: Fix a data-race around sysctl_tcp_comp_sack_nr.
d2476f2059 tcp: Fix a data-race around sysctl_tcp_comp_sack_slack_ns.
4832397891 tcp: Fix a data-race around sysctl_tcp_comp_sack_delay_ns.
530a4da37e net: macsec: fix potential resource leak in macsec_add_rxsa() and macsec_add_txsa()
6e0e0464f1 macsec: always read MACSEC_SA_ATTR_PN as a u64
2daf0a1261 macsec: limit replay window size with XPN
0755c9d05a macsec: fix error message in macsec_add_rxsa and _txsa
54c295a30f macsec: fix NULL deref in macsec_add_rxsa
034bfadc8f Documentation: fix sctp_wmem in ip-sysctl.rst
4aea33f404 tcp: Fix a data-race around sysctl_tcp_invalid_ratelimit.
c4e6029a85 tcp: Fix a data-race around sysctl_tcp_autocorking.
83edb788e6 tcp: Fix a data-race around sysctl_tcp_min_rtt_wlen.
f47e7e5b49 tcp: Fix a data-race around sysctl_tcp_min_tso_segs.
5584fe9718 net: sungem_phy: Add of_node_put() for reference returned by of_get_parent()
b399ffafff igmp: Fix data-races around sysctl_igmp_qrv.
4c1318dabe net/tls: Remove the context from the list in tls_device_down
8008e797ec ipv6/addrconf: fix a null-ptr-deref bug for ip6_ptr
a84b8b53a5 net: ping6: Fix memleak in ipv6_renew_options().
c37c7f35d7 tcp: Fix a data-race around sysctl_tcp_challenge_ack_limit.
9ffb4fdfd8 tcp: Fix a data-race around sysctl_tcp_limit_output_bytes.
3e93312583 tcp: Fix data-races around sysctl_tcp_moderate_rcvbuf.
77ac046a9a Revert "tcp: change pingpong threshold to 3"
54a73d6544 scsi: ufs: host: Hold reference returned by of_parse_phandle()
160f79561e ice: do not setup vlan for loopback VSI
9ed6f97c8d ice: check (DD | EOF) bits on Rx descriptor rather than (EOP | RS)
2b4b373271 tcp: Fix data-races around sysctl_tcp_no_ssthresh_metrics_save.
3fb21b67c0 tcp: Fix a data-race around sysctl_tcp_nometrics_save.
81c45f49e6 tcp: Fix a data-race around sysctl_tcp_frto.
312ce3901f tcp: Fix a data-race around sysctl_tcp_adv_win_scale.
3cddb7a7a5 tcp: Fix a data-race around sysctl_tcp_app_win.
f10a5f905a tcp: Fix data-races around sysctl_tcp_dsack.
7fa8999b31 watch_queue: Fix missing locking in add_watch_to_object()
45a84f04a9 watch_queue: Fix missing rcu annotation
b38a8802c5 nouveau/svm: Fix to migrate all requested pages
bd46ca4146 s390/archrandom: prevent CPACF trng invocations in interrupt context
1228934cf2 ntfs: fix use-after-free in ntfs_ucsncmp()
5528990512 Revert "ocfs2: mount shared volume without ha stack"
de5d4654ac Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put
a46cc20143 Merge 5.10.133 into android12-5.10-lts
3f05c6dd13 ANDROID: fix up 5.10.132 merge with the virtio_mmio.c driver
7a62a4b621 Linux 5.10.134
bb1990a300 watch-queue: remove spurious double semicolon
f7c1fc0dec net: usb: ax88179_178a needs FLAG_SEND_ZLP
08afa87f58 tty: use new tty_insert_flip_string_and_push_buffer() in pty_write()
a4bb7ef2d6 tty: extract tty_flip_buffer_commit() from tty_flip_buffer_push()
c84986d097 tty: drop tty_schedule_flip()
4d374625cc tty: the rest, stop using tty_schedule_flip()
6a81848252 tty: drivers/tty/, stop using tty_schedule_flip()
0adf21eec5 watchqueue: make sure to serialize 'wqueue->defunct' properly
c0a3a9eb26 x86/alternative: Report missing return thunk details
b7b9e5cc8b x86/amd: Use IBPB for firmware calls
14fc9233aa Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks
f44e65e6f0 Bluetooth: SCO: Fix sco_send_frame returning skb->len
a8feae8bd2 Bluetooth: Fix passing NULL to PTR_ERR
5283591c84 Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg
341a029cf3 Bluetooth: SCO: Replace use of memcpy_from_msg with bt_skb_sendmsg
3cce0e771f Bluetooth: Add bt_skb_sendmmsg helper
c87b2bc9d7 Bluetooth: Add bt_skb_sendmsg helper
4faf4bbc2d ALSA: memalloc: Align buffer allocations in page size
d1dc861cd6 bitfield.h: Fix "type of reg too small for mask" test
f62ffdb5e2 drm/imx/dcss: fix unused but set variable warnings
577b624689 dlm: fix pending remove if msg allocation fails
cdcd20aa2c x86/bugs: Warn when "ibrs" mitigation is selected on Enhanced IBRS parts
26d5eb3c25 sched/deadline: Fix BUG_ON condition for deboosted tasks
0c722a32f2 bpf: Make sure mac_header was set before using it
ddb3f0b688 mm/mempolicy: fix uninit-value in mpol_rebind_policy()
3616776bc5 KVM: Don't null dereference ops->destroy
684896e675 spi: bcm2835: bcm2835_spi_handle_err(): fix NULL pointer deref for non DMA transfers
0648526633 tcp: Fix data-races around sysctl_tcp_max_reordering.
805f1c7ce4 tcp: Fix a data-race around sysctl_tcp_rfc1337.
03bb3892f3 tcp: Fix a data-race around sysctl_tcp_stdurg.
daa8b5b869 tcp: Fix a data-race around sysctl_tcp_retrans_collapse.
0e3f82a03e tcp: Fix data-races around sysctl_tcp_slow_start_after_idle.
cc133e4f4b tcp: Fix a data-race around sysctl_tcp_thin_linear_timeouts.
d8781f7cd0 tcp: Fix data-races around sysctl_tcp_recovery.
11e8b013d1 tcp: Fix a data-race around sysctl_tcp_early_retrans.
ffc388f6f0 tcp: Fix data-races around sysctl knobs related to SYN option.
fcaef69c79 udp: Fix a data-race around sysctl_udp_l3mdev_accept.
9add240f76 ip: Fix data-races around sysctl_ip_prot_sock.
e045d672ba ipv4: Fix a data-race around sysctl_fib_multipath_use_neigh.
36f1d9c607 drm/imx/dcss: Add missing of_node_put() in fail path
665cbe91de be2net: Fix buffer overflow in be_get_module_eeprom
4752392855 gpio: pca953x: use the correct register address when regcache sync during init
a941e6d5ba gpio: pca953x: use the correct range when do regmap sync
928ded3fc1 gpio: pca953x: only use single read/write for No AI mode
b82de63f8f ixgbe: Add locking to prevent panic when setting sriov_numvfs to zero
6f949e5615 i40e: Fix erroneous adapter reinitialization during recovery process
c6af943249 iavf: Fix handling of dummy receive descriptors
0dc2f19d8c tcp: Fix data-races around sysctl_tcp_fastopen_blackhole_timeout.
22938534c6 tcp: Fix data-races around sysctl_tcp_fastopen.
b3ce32e33a tcp: Fix data-races around sysctl_max_syn_backlog.
b6c189aa80 tcp: Fix a data-race around sysctl_tcp_tw_reuse.
fd6f1284e3 tcp: Fix a data-race around sysctl_tcp_notsent_lowat.
768e424607 tcp: Fix data-races around some timeout sysctl knobs.
474510e174 tcp: Fix data-races around sysctl_tcp_reordering.
dc1a78a2b2 tcp: Fix data-races around sysctl_tcp_syncookies.
fc489055e7 tcp: Fix data-races around keepalive sysctl knobs.
f85119fb3f igmp: Fix data-races around sysctl_igmp_max_msf.
7d26db0053 igmp: Fix a data-race around sysctl_igmp_max_memberships.
473aad9ad5 igmp: Fix data-races around sysctl_igmp_llm_reports.
e80ff0b966 net/tls: Fix race in TLS device down flow
a3ac79f38d net: stmmac: fix dma queue left shift overflow issue
f3da643d87 i2c: cadence: Change large transfer count reset logic to be unconditional
dd7b5ba44b net: stmmac: fix unbalanced ptp clock issue in suspend/resume flow
c61aede097 tcp: Fix a data-race around sysctl_tcp_probe_interval.
d452ce36f2 tcp: Fix a data-race around sysctl_tcp_probe_threshold.
d5bece4df6 tcp: Fix a data-race around sysctl_tcp_mtu_probe_floor.
97992e8fef tcp: Fix data-races around sysctl_tcp_min_snd_mss.
514d2254c7 tcp: Fix data-races around sysctl_tcp_base_mss.
77a04845f0 tcp: Fix data-races around sysctl_tcp_mtu_probing.
d4f65615db tcp/dccp: Fix a data-race around sysctl_tcp_fwmark_accept.
0ee76fe01f ip: Fix a data-race around sysctl_fwmark_reflect.
611ba70e5a ip: Fix a data-race around sysctl_ip_autobind_reuse.
94269132d0 ip: Fix data-races around sysctl_ip_nonlocal_bind.
11038fa781 ip: Fix data-races around sysctl_ip_fwd_update_priority.
b96ed5ccb0 ip: Fix data-races around sysctl_ip_fwd_use_pmtu.
5e343e3ef4 ip: Fix data-races around sysctl_ip_no_pmtu_disc.
77836dbe35 igc: Reinstate IGC_REMOVED logic and implement it properly
fb6031203e drm/amdgpu/display: add quirk handling for stutter mode
43128b3eee perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()
5694b162f2 pinctrl: ralink: Check for null return of devm_kcalloc
493ceca327 power/reset: arm-versatile: Fix refcount leak in versatile_reboot_probe
47b696dd65 xfrm: xfrm_policy: fix a possible double xfrm_pols_put() in xfrm_bundle_lookup()
3777ea39f0 serial: mvebu-uart: correctly report configured baudrate value
e744aad0c4 PCI: hv: Fix interrupt mapping for multi-MSI
522bd31d6b PCI: hv: Reuse existing IRTE allocation in compose_msi_msg()
73bf070408 PCI: hv: Fix hv_arch_irq_unmask() for multi-MSI
f1d2f1ce05 PCI: hv: Fix multi-MSI to allow more than one MSI vector
b07240ce4a Revert "m68knommu: only set CONFIG_ISA_DMA_API for ColdFire sub-arch"
4f900c37f1 net: inline rollback_registered_many()
bf2f3d1970 net: move rollback_registered_many()
672fac0a43 net: inline rollback_registered()
b1158677d4 net: move net_set_todo inside rollback_registered()
2e11856ec3 net: make sure devices go through netdev_wait_all_refs
ed6964ff47 net: make free_netdev() more lenient with unregistering devices
2686f62fa7 docs: net: explain struct net_device lifetime
7a99c7c32c xen/gntdev: Ignore failure to unmap INVALID_GRANT_HANDLE
2ee0cab11f io_uring: Use original task for req identity in io_identity_cow()
ab5050fd74 lockdown: Fix kexec lockdown bypass with ima policy
426336de35 mlxsw: spectrum_router: Fix IPv4 nexthop gateway indication
15155fa898 riscv: add as-options for modules with assembly compontents
31f3bb363a pinctrl: stm32: fix optional IRQ support to gpios
bbc03f7ab8 Revert "cgroup: Use separate src/dst nodes when preloading css_sets for migration"
0c724b692d Merge 5.10.132 into android12-5.10-lts
ccdb3f9143 Merge 5.10.131 into android12-5.10-lts
50c9c56f73 Merge 5.10.130 into android12-5.10-lts
2be16baf4d Merge 5.10.129 into android12-5.10-lts
96fb478c9d Merge 5.10.128 into android12-5.10-lts
195692d0ab Merge 5.10.127 into android12-5.10-lts
f93a6ac3d6 Merge 5.10.126 into android12-5.10-lts
36c687c707 Merge 5.10.125 into android12-5.10-lts
4e3458d6d3 Merge 5.10.124 into android12-5.10-lts
fa431a5707 Merge 5.10.123 into android12-5.10-lts
8a8eb074ed Merge 5.10.122 into android12-5.10-lts
0ced6946ac Revert "drm: fix EDID struct for old ARM OABI format"
dca272b05d Revert "mailbox: forward the hrtimer if not queued and under a lock"
a73f6da5a3 Revert "Fonts: Make font size unsigned in font_desc"
8324f66c71 Revert "parisc/stifb: Keep track of hardware path of graphics card"
26e506a63e Revert "Bluetooth: Interleave with allowlist scan"
8046f2ad50 Revert "Bluetooth: use inclusive language when filtering devices"
b41a77c33b Revert "Bluetooth: use hdev lock for accept_list and reject_list in conn req"
fe07069084 Revert "thermal/drivers/core: Use a char pointer for the cooling device name"
361d75b4c1 Revert "thermal/core: Fix memory leak in __thermal_cooling_device_register()"
090d920be9 Revert "thermal/core: fix a UAF bug in __thermal_cooling_device_register()"
2dc56158cb Revert "thermal/core: Fix memory leak in the error path"
28fd8700b4 Revert "ALSA: jack: Access input_dev under mutex"
8636671438 Revert "gpiolib: of: Introduce hook for missing gpio-ranges"
0889c70b1f Revert "pinctrl: bcm2835: implement hook for missing gpio-ranges"
eaa4878a26 Revert "ext4: fix use-after-free in ext4_rename_dir_prepare"
f004760d69 Revert "ext4: verify dir block before splitting it"
5034934536 Linux 5.10.133
2fc7f18ba2 tools headers: Remove broken definition of __LITTLE_ENDIAN
060e39b8c2 tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy' - again
fbf60f83e2 objtool: Fix elf_create_undef_symbol() endianness
39065d5434 kvm: fix objtool relocation warning
6849ed81a3 x86: Use -mindirect-branch-cs-prefix for RETPOLINE builds
8e2774270a um: Add missing apply_returns()
725da3e67c x86/bugs: Remove apostrophe typo
81604506c2 tools headers cpufeatures: Sync with the kernel sources
3f93b8630a tools arch x86: Sync the msr-index.h copy with the kernel sources
2ef1b06cea KVM: emulate: do not adjust size of fastop and setcc subroutines
8e31dfd630 x86/kvm: fix FASTOP_SIZE when return thunks are enabled
5779e2f0cc efi/x86: use naked RET on mixed mode call wrapper
abf88ff134 x86/speculation: Use DECLARE_PER_CPU for x86_spec_ctrl_current
ecc0d92a9f x86/asm/32: Fix ANNOTATE_UNRET_SAFE use on 32-bit
95d89ec7db x86/ftrace: Add UNWIND_HINT_FUNC annotation for ftrace_stub
668cb1ddf0 x86/xen: Fix initialisation in hypercall_page after rethunk
81f20e5000 x86, kvm: use proper ASM macros for kvm_vcpu_is_preempted
844947eee3 tools/insn: Restore the relative include paths for cross building
c035ca88b0 x86/static_call: Serialize __static_call_fixup() properly
eb38964b6f x86/speculation: Disable RRSBA behavior
c2ca992144 x86/kexec: Disable RET on kexec
51552b6b52 x86/bugs: Do not enable IBPB-on-entry when IBPB is not supported
609336351d x86/bugs: Add Cannon lake to RETBleed affected CPU list
b24fdd0f1c x86/retbleed: Add fine grained Kconfig knobs
f7851ed697 x86/cpu/amd: Enumerate BTC_NO
a74f5d23e6 x86/common: Stamp out the stepping madness
4d7f72b6e1 x86/speculation: Fill RSB on vmexit for IBRS
47ae76fb27 KVM: VMX: Fix IBRS handling after vmexit
5269be9111 KVM: VMX: Prevent guest RSB poisoning attacks with eIBRS
84061fff2a KVM: VMX: Convert launched argument to flags
07401c2311 KVM: VMX: Flatten __vmx_vcpu_run()
df93717a32 objtool: Re-add UNWIND_HINT_{SAVE_RESTORE}
1dbefa5772 x86/speculation: Remove x86_spec_ctrl_mask
ce11f91b21 x86/speculation: Use cached host SPEC_CTRL value for guest entry/exit
aad83db22e x86/speculation: Fix SPEC_CTRL write on SMT state change
d29c07912a x86/speculation: Fix firmware entry SPEC_CTRL handling
f1b01ace81 x86/speculation: Fix RSB filling with CONFIG_RETPOLINE=n
ea1aa926f4 x86/cpu/amd: Add Spectral Chicken
0d1a8a16e6 objtool: Add entry UNRET validation
fbab1c94eb x86/bugs: Do IBPB fallback check only once
c8845b8754 x86/bugs: Add retbleed=ibpb
f728eff263 x86/xen: Rename SYS* entry points
28aa3fa0b2 objtool: Update Retpoline validation
55bba093fd intel_idle: Disable IBRS during long idle
e8142e2d6c x86/bugs: Report Intel retbleed vulnerability
a0f8ef71d7 x86/bugs: Split spectre_v2_select_mitigation() and spectre_v2_user_select_mitigation()
dabc2a1b40 x86/speculation: Add spectre_v2=ibrs option to support Kernel IBRS
6d7e13ccc4 x86/bugs: Optimize SPEC_CTRL MSR writes
3dddacf8c3 x86/entry: Add kernel IBRS implementation
9e727e0d94 x86/bugs: Keep a per-CPU IA32_SPEC_CTRL value
a989e75136 x86/bugs: Enable STIBP for JMP2RET
3f29791d56 x86/bugs: Add AMD retbleed= boot parameter
876750cca4 x86/bugs: Report AMD retbleed vulnerability
df748593c5 x86: Add magic AMD return-thunk
c70d6f8214 objtool: Treat .text.__x86.* as noinstr
c9eb5dcdc8 x86: Use return-thunk in asm code
5b2edaf709 x86/sev: Avoid using __x86_return_thunk
d6eb50e9b7 x86/vsyscall_emu/64: Don't use RET in vsyscall emulation
ee4996f07d x86/kvm: Fix SETcc emulation for return thunks
e0e06a9227 x86/bpf: Use alternative RET encoding
00b136bb62 x86/ftrace: Use alternative RET encoding
7723edf5ed x86,static_call: Use alternative RET encoding
446eb6f089 objtool: skip non-text sections when adding return-thunk sites
8bdb25f7ae x86,objtool: Create .return_sites
716410960b x86: Undo return-thunk damage
270de63cf4 x86/retpoline: Use -mfunction-return
37b9bb0941 Makefile: Set retpoline cflags based on CONFIG_CC_IS_{CLANG,GCC}
3e519ed8d5 x86/retpoline: Swizzle retpoline thunk
6a2b142886 x86/retpoline: Cleanup some #ifdefery
feec5277d5 x86/cpufeatures: Move RETPOLINE flags to word 11
7070bbb66c x86/kvm/vmx: Make noinstr clean
accb8cfd50 x86/realmode: build with -D__DISABLE_EXPORTS
236b959da9 objtool: Fix objtool regression on x32 systems
148811a842 x86/entry: Remove skip_r11rcx
e1db6c8a69 objtool: Fix symbol creation
3e8afd072d objtool: Fix type of reloc::addend
42ec4d7135 objtool: Fix code relocs vs weak symbols
831d5c07b7 objtool: Fix SLS validation for kcov tail-call replacement
9728af8857 crypto: x86/poly1305 - Fixup SLS
03c5c33e04 objtool: Default ignore INT3 for unreachable
bef21f88b4 kvm/emulate: Fix SETcc emulation function offsets with SLS
494ed76c14 tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy'
e9925a4584 x86: Add straight-line-speculation mitigation
0f8532c283 objtool: Add straight-line-speculation validation
1f6e6683c4 x86/alternative: Relax text_poke_bp() constraint
277f4ddc36 x86: Prepare inline-asm for straight-line-speculation
3c91e22576 x86: Prepare asm files for straight-line-speculation
a512fcd881 x86/lib/atomic64_386_32: Rename things
c2746d567d bpf,x86: Respect X86_FEATURE_RETPOLINE*
1713e5c4f8 bpf,x86: Simplify computing label offsets
38a80a3ca2 x86/alternative: Add debug prints to apply_retpolines()
3d13ee0d41 x86/alternative: Try inline spectre_v2=retpoline,amd
b0e2dc9506 x86/alternative: Handle Jcc __x86_indirect_thunk_\reg
381fd04c97 x86/alternative: Implement .retpoline_sites support
6eb95718f3 x86/retpoline: Create a retpoline thunk array
0de47ad5b9 x86/retpoline: Move the retpoline thunk declarations to nospec-branch.h
41ef958070 x86/asm: Fixup odd GEN-for-each-reg.h usage
8ef808b3f4 x86/asm: Fix register order
ccb8fc65a3 x86/retpoline: Remove unused replacement symbols
908bd980a8 objtool,x86: Replace alternatives with .retpoline_sites
023e78bbf1 objtool: Explicitly avoid self modifying code in .altinstr_replacement
6e4676f438 objtool: Classify symbols
acc0be56b4 objtool: Handle __sanitize_cov*() tail calls
9d7ec2418a objtool: Introduce CFI hash
e8b1128fb0 objtool: Make .altinstructions section entry size consistent
1afa44480b objtool: Remove reloc symbol type checks in get_alt_entry()
e7118a25a8 objtool: print out the symbol type when complaining about it
7ea0731957 objtool: Teach get_alt_entry() about more relocation types
364e463097 objtool: Don't make .altinstructions writable
f231b2ee85 objtool/x86: Ignore __x86_indirect_alt_* symbols
e32542e9ed objtool: Only rewrite unconditional retpoline thunk calls
a031925382 objtool: Fix .symtab_shndx handling for elf_create_undef_symbol()
76474a9dd3 x86/alternative: Optimize single-byte NOPs at an arbitrary position
f3fe1b141d objtool: Support asm jump tables
0b2c8bf498 objtool/x86: Rewrite retpoline thunk calls
ed7783dca5 objtool: Skip magical retpoline .altinstr_replacement
e87c18c4a9 objtool: Cache instruction relocs
33092b4866 objtool: Keep track of retpoline call sites
8a6d73f7db objtool: Add elf_create_undef_symbol()
b69e1b4b68 objtool: Extract elf_symbol_add()
da962cd0a2 objtool: Extract elf_strtab_concat()
b37c439250 objtool: Create reloc sections implicitly
fcdb7926d3 objtool: Add elf_create_reloc() helper
c9049cf480 objtool: Rework the elf_rebuild_reloc_section() logic
d42fa5bf19 objtool: Handle per arch retpoline naming
6e95f8caff objtool: Correctly handle retpoline thunk calls
28ca351296 x86/retpoline: Simplify retpolines
e68db6f780 x86/alternatives: Optimize optimize_nops()
9a6471666b x86: Add insn_decode_kernel()
d9cd219114 x86/alternative: Use insn_decode()
e6f8dc86a1 x86/insn-eval: Handle return values from the decoder
6bc6875b82 x86/insn: Add an insn_decode() API
76c513c87f x86/insn: Add a __ignore_sync_check__ marker
a3d96c7439 x86/insn: Rename insn_decode() to insn_decode_from_regs()
fd80da64cf x86/alternative: Use ALTERNATIVE_TERNARY() in _static_cpu_has()
341e6178c1 x86/alternative: Support ALTERNATIVE_TERNARY
0c4c698569 x86/alternative: Support not-feature
c9cf908b89 x86/alternative: Merge include files
5f93d900b9 x86/xen: Support objtool vmlinux.o validation in xen-head.S
b626e17c11 x86/xen: Support objtool validation in xen-asm.S
3116dee270 objtool: Combine UNWIND_HINT_RET_OFFSET and UNWIND_HINT_FUNC
53e89bc78e objtool: Assume only ELF functions do sibling calls
3e674f2652 objtool: Support retpoline jump detection for vmlinux.o
917a4f6348 objtool: Support stack layout changes in alternatives
e9197d768f objtool: Add 'alt_group' struct
1d516bd72a objtool: Refactor ORC section generation
dd87aa5f61 KVM/nVMX: Use __vmx_vcpu_run in nested_vmx_check_vmentry_hw
0ca2ba6e4d KVM/VMX: Use TEST %REG,%REG instead of CMP $0,%REG in vmenter.S
0e8e989142 Merge 5.10.121 into android12-5.10-lts
2de0a17df4 Merge 5.10.120 into android12-5.10-lts
7748091a31 Linux 5.10.132
06a5dc3911 x86/pat: Fix x86_has_pat_wp()
d9cb6fabc9 serial: 8250: Fix PM usage_count for console handover
e1bd94dd9e serial: pl011: UPSTAT_AUTORTS requires .throttle/unthrottle
b8c4661126 serial: stm32: Clear prev values before setting RTS delays
039ffe436a serial: 8250: fix return error code in serial8250_request_std_resource()
bfee93c9a6 vt: fix memory overlapping when deleting chars in the buffer
5450430199 tty: serial: samsung_tty: set dma burst_size to 1
0e5668ed7b usb: dwc3: gadget: Fix event pending check
f1e01a42dc usb: typec: add missing uevent when partner support PD
61ab5d644e USB: serial: ftdi_sio: add Belimo device ids
58b94325ee signal handling: don't use BUG_ON() for debugging
e75f692b79 nvme-pci: phison e16 has bogus namespace ids
54bf0b8c75 Revert "can: xilinx_can: Limit CANFD brp to 2"
35ce2c64e5 ARM: dts: stm32: use the correct clock source for CEC on stm32mp151
227ee155ea soc: ixp4xx/npe: Fix unused match warning
136d7987fc x86: Clear .brk area at early boot
fd830d8dd5 irqchip: or1k-pic: Undefine mask_ack for level triggered hardware
dae43b3792 ASoC: madera: Fix event generation for rate controls
cae4b78f3c ASoC: madera: Fix event generation for OUT1 demux
a7634527cb ASoC: cs47l15: Fix event generation for low power mux control
41f97b0ecf ASoC: dapm: Initialise kcontrol data for mux/demux controls
11a14e4f31 ASoC: wm5110: Fix DRE control
6cbbe59fdc ASoC: SOF: Intel: hda-loader: Clarify the cl_dsp_init() flow
ef1e38532f pinctrl: aspeed: Fix potential NULL dereference in aspeed_pinmux_set_mux()
13fb9105cf ASoC: ops: Fix off by one in range control validation
67dc32542a net: sfp: fix memory leak in sfp_probe()
104594de27 nvme: fix regression when disconnect a recovering ctrl
5504e63832 nvme-tcp: always fail a request when sending it failed
de876f36f9 NFC: nxp-nci: don't print header length mismatch on i2c error
efa78f2ae3 net: tipc: fix possible refcount leak in tipc_sk_create()
bacfef0bf2 platform/x86: hp-wmi: Ignore Sanitization Mode event
3ea9dbf7c2 cpufreq: pmac32-cpufreq: Fix refcount leak bug
24cd0b9bfd scsi: hisi_sas: Limit max hw sectors for v3 HW
c458ebd659 netfilter: br_netfilter: do not skip all hooks with 0 priority
93135dca8c virtio_mmio: Restore guest page size on resume
d611580032 virtio_mmio: Add missing PM calls to freeze/restore
31e16a5e11 mm: sysctl: fix missing numa_stat when !CONFIG_HUGETLB_PAGE
c713de1d80 net/tls: Check for errors in tls_device_init
eb58fd350a KVM: x86: Fully initialize 'struct kvm_lapic_irq' in kvm_pv_kick_cpu_op()
c2978d0124 net: atlantic: remove aq_nic_deinit() when resume
38e081ee06 net: atlantic: remove deep parameter on suspend/resume functions
b82e4ad58a sfc: fix kernel panic when creating VF
2d4efc9a0e seg6: bpf: fix skb checksum in bpf_push_seg6_encap()
7b38df59a8 seg6: fix skb checksum in SRv6 End.B6 and End.B6.Encaps behaviors
834fa0a22f seg6: fix skb checksum evaluation in SRH encapsulation/insertion
c224050081 sfc: fix use after free when disabling sriov
c1d9702ceb ima: Fix potential memory leak in ima_init_crypto()
eb360267e1 ima: force signature verification when CONFIG_KEXEC_SIG is configured
29c6a632f8 net: ftgmac100: Hold reference returned by of_get_child_by_name()
a51040d4b1 nexthop: Fix data-races around nexthop_compat_mode.
2c56958de8 ipv4: Fix data-races around sysctl_ip_dynaddr.
038a87b3e4 raw: Fix a data-race around sysctl_raw_l3mdev_accept.
38d78c7b4b icmp: Fix a data-race around sysctl_icmp_ratemask.
4ebf261532 icmp: Fix a data-race around sysctl_icmp_ratelimit.
b8871d9186 sysctl: Fix data-races in proc_dointvec_ms_jiffies().
2744e302e7 drm/i915/gt: Serialize TLB invalidates with GT resets
636e5dbaf0 drm/i915/selftests: fix a couple IS_ERR() vs NULL tests
359f2bca79 ARM: dts: sunxi: Fix SPI NOR campatible on Orange Pi Zero
e1aa73454a ARM: dts: at91: sama5d2: Fix typo in i2s1 node
418b191d5f ipv4: Fix a data-race around sysctl_fib_sync_mem.
e088ceb73c icmp: Fix data-races around sysctl.
fe2a35fa2c cipso: Fix data-races around sysctl.
f5811b8df2 net: Fix data-races around sysctl_mem.
d54b6ef53c inetpeer: Fix data-races around sysctl.
6481a8a72a tcp: Fix a data-race around sysctl_tcp_max_orphans.
609ce7ff75 sysctl: Fix data races in proc_dointvec_jiffies().
a5ee448d38 sysctl: Fix data races in proc_doulongvec_minmax().
e3a2144b3b sysctl: Fix data races in proc_douintvec_minmax().
71ddde27c2 sysctl: Fix data races in proc_dointvec_minmax().
d5d54714e3 sysctl: Fix data races in proc_douintvec().
80cc28a4b4 sysctl: Fix data races in proc_dointvec().
9cc8edc571 net: stmmac: dwc-qos: Disable split header for Tegra194
cd201332cc ASoC: Intel: Skylake: Correct the handling of fmt_config flexible array
fbb87a0ed2 ASoC: Intel: Skylake: Correct the ssp rate discovery in skl_get_ssp_clks()
bb8bf80387 ASoC: tas2764: Fix amp gain register offset & default
f1cd988de4 ASoC: tas2764: Correct playback volume range
52d1b4250c ASoC: tas2764: Fix and extend FSYNC polarity handling
249fe2d20d ASoC: tas2764: Add post reset delays
f160a1f970 ASoC: sgtl5000: Fix noise on shutdown/remove
831e190175 ima: Fix a potential integer overflow in ima_appraise_measurement
592f3bad00 drm/i915: fix a possible refcount leak in intel_dp_add_mst_connector()
4cb5c1950b net/mlx5e: Fix capability check for updating vnic env counters
6eb1d0c370 net/mlx5e: kTLS, Fix build time constant test in RX
c87d5211be net/mlx5e: kTLS, Fix build time constant test in TX
d6cab2e06c ARM: 9210/1: Mark the FDT_FIXED sections as shareable
3d82fba7d3 ARM: 9209/1: Spectre-BHB: avoid pr_info() every time a CPU comes out of idle
0c300e294d spi: amd: Limit max transfer and message size
d8d42c92fe ARM: dts: imx6qdl-ts7970: Fix ngpio typo and count
91f90b571f ext4: fix race condition between ext4_write and ext4_convert_inline_data
9d883b3f00 Revert "evm: Fix memleak in init_desc"
41007669fc sh: convert nommu io{re,un}map() to static inline functions
ea4dbcfb95 nilfs2: fix incorrect masking of permission flags for symlinks
14e63942d6 fs/remap: constrain dedupe of EOF blocks
0581613df7 drm/panfrost: Fix shrinker list corruption by madvise IOCTL
2e760fe05d drm/panfrost: Put mapping instead of shmem obj on panfrost_mmu_map_fault_addr() error
c1ea39a77c btrfs: return -EAGAIN for NOWAIT dio reads/writes on compressed and inline extents
7657e39585 cgroup: Use separate src/dst nodes when preloading css_sets for migration
e013ea2a51 wifi: mac80211: fix queue selection for mesh/OCB interfaces
db6e8c3015 ARM: 9214/1: alignment: advance IT state after emulating Thumb instruction
f851e4f402 ARM: 9213/1: Print message about disabled Spectre workarounds only once
fa40bb3a5f ip: fix dflt addr selection for connected nexthop
4d3e0fb05e net: sock: tracing: Fix sock_exceed_buf_limit not to dereference stale pointer
78a1400c42 tracing/histograms: Fix memory leak problem
931dbcc2e0 mm: split huge PUD on wp_huge_pud fallback
91530f675e fix race between exit_itimers() and /proc/pid/timers
b9c32a6886 xen/netback: avoid entering xenvif_rx_next_skb() with an empty rx queue
782a6b07b1 ALSA: hda/realtek - Enable the headset-mic on a Xiaomi's laptop
cacac3e13a ALSA: hda/realtek - Fix headset mic problem for a HP machine with alc221
08ab39027a ALSA: hda/realtek - Fix headset mic problem for a HP machine with alc671
4d0d15d184 ALSA: hda/realtek: Fix headset mic for Acer SF313-51
b642a3476a ALSA: hda/conexant: Apply quirk for another HP ProDesk 600 G3 model
4486bbe928 ALSA: hda - Add fixup for Dell Latitidue E5430
8f95261a00 Linux 5.10.131
cc5ee0e0ee Revert "mtd: rawnand: gpmi: Fix setting busy timeout setting"
ebc9fb07d2 ANDROID: random: fix CRC issues with the merge
e61ebc6383 ANDROID: change function signatures for some random functions.
830f0202d7 ANDROID: cpu/hotplug: avoid breaking Android ABI by fusing cpuhp steps
fee299e72e ANDROID: random: add back removed callback functions
6cc2db3cde UPSTREAM: Revert "net: af_key: add check for pfkey_broadcast in function pfkey_process"
05982f0cbb UPSTREAM: lib/crypto: add prompts back to crypto libraries
f2eb31a498 Merge 5.10.119 into android12-5.10-lts
26ae9c3614 Linux 5.10.130
8365b151fd dmaengine: ti: Add missing put_device in ti_dra7_xbar_route_allocate
37147e22cd dmaengine: ti: Fix refcount leak in ti_dra7_xbar_route_allocate
1be247db20 dmaengine: at_xdma: handle errors of at_xdmac_alloc_desc() correctly
7b721f5aec dmaengine: pl330: Fix lockdep warning about non-static key
e23cfb3fdc ida: don't use BUG_ON() for debugging
37995f034f dt-bindings: dma: allwinner,sun50i-a64-dma: Fix min/max typo
ca4a919584 misc: rtsx_usb: set return value in rsp_buf alloc err path
ff79e0ca2b misc: rtsx_usb: use separate command and response buffers
af7d9d4abe misc: rtsx_usb: fix use of dma mapped buffer for usb bulk transfer
86884017bb dmaengine: imx-sdma: Allow imx8m for imx7 FW revs
9b329edd77 i2c: cadence: Unregister the clk notifier in error path
26938bd28c r8169: fix accessing unset transport header
904f622ec7 selftests: forwarding: fix error message in learning_test
9906c22340 selftests: forwarding: fix learning_test when h1 supports IFF_UNICAST_FLT
859b889029 selftests: forwarding: fix flood_unicast_test when h2 supports IFF_UNICAST_FLT
23cdc57d88 ibmvnic: Properly dispose of all skbs during a failover.
2b4659c145 i40e: Fix dropped jumbo frames statistics
5561bddd05 xsk: Clear page contiguity bit when unmapping pool
87d2bb8882 ARM: dts: at91: sama5d2_icp: fix eeprom compatibles
9b7d8e28b6 ARM: dts: at91: sam9x60ek: fix eeprom compatible and size
ade03e5ea7 ARM: at91: pm: use proper compatibles for sam9x60's rtc and rtt
b40ac801cb ARM: at91: pm: use proper compatible for sama5d2's rtc
4c3e73a66a arm64: dts: qcom: msm8992-*: Fix vdd_lvs1_2-supply typo
1d0c3ced2d pinctrl: sunxi: sunxi_pconf_set: use correct offset
e1cda2a03d arm64: dts: imx8mp-evk: correct I2C3 pad settings
2ade1b1d92 arm64: dts: imx8mp-evk: correct gpio-led pad settings
17b3883ba5 arm64: dts: imx8mp-evk: correct the uart2 pinctl value
43319ee6a0 arm64: dts: imx8mp-evk: correct mmc pad settings
6bf74a1e74 arm64: dts: qcom: msm8994: Fix CPU6/7 reg values
2c0d10ce00 pinctrl: sunxi: a83t: Fix NAND function name for some pins
3d90607e7e ARM: meson: Fix refcount leak in meson_smp_prepare_cpus
e14930e9f9 xfs: remove incorrect ASSERT in xfs_rename
852952ea0e can: kvaser_usb: kvaser_usb_leaf: fix bittiming limits
a741e762e1 can: kvaser_usb: kvaser_usb_leaf: fix CAN clock frequency regression
f439d08ef1 can: kvaser_usb: replace run-time checks with struct kvaser_usb_driver_info
79af7be44c powerpc/powernv: delay rng platform device creation until later in boot
19104425c9 video: of_display_timing.h: include errno.h
96fa24eb1a memregion: Fix memregion_free() fallback definition
d6931bff1c PM: runtime: Redefine pm_runtime_release_supplier()
cecb806c76 fbcon: Prevent that screen size is smaller than font size
b727561ddc fbcon: Disallow setting font bigger than screen size
b81212828a fbmem: Check virtual screen sizes in fb_set_var()
d03e8ed72d fbdev: fbmem: Fix logo center image dx issue
963c80f070 iommu/vt-d: Fix PCI bus rescan device hot add
0a5e36dbcb netfilter: nf_tables: stricter validation of element data
4a6430b99f netfilter: nft_set_pipapo: release elements in clone from abort path
4f59d12efe net: rose: fix UAF bug caused by rose_t0timer_expiry
0085da9df3 usbnet: fix memory leak in error case
e917be1f83 bpf: Fix insufficient bounds propagation from adjust_scalar_min_max_vals
9adec73349 bpf: Fix incorrect verifier simulation around jmp32's jeq/jne
d0b8e22399 can: gs_usb: gs_usb_open/close(): fix memory leak
b6f4b347a1 can: grcan: grcan_probe(): remove extra of_node_get()
85cd41070d can: bcm: use call_rcu() instead of costly synchronize_rcu()
b75d4bec85 ALSA: hda/realtek: Add quirk for Clevo L140PU
6c32496964 mm/slub: add missing TID updates on slab deactivation
7208d1236f Linux 5.10.129
0e21ef1801 clocksource/drivers/ixp4xx: remove EXPORT_SYMBOL_GPL from ixp4xx_timer_setup()
7055e34462 net: usb: qmi_wwan: add Telit 0x1070 composition
f1a53bb27f net: usb: qmi_wwan: add Telit 0x1060 composition
43c8d33ce3 xen/arm: Fix race in RB-tree based P2M accounting
547b7c640d xen-netfront: restore __skb_queue_tail() positioning in xennet_get_responses()
cbbd2d2531 xen/blkfront: force data bouncing when backend is untrusted
4923217af5 xen/netfront: force data bouncing when backend is untrusted
728d68bfe6 xen/netfront: fix leaking data in shared pages
cfea428030 xen/blkfront: fix leaking data in shared pages
d341e5a754 selftests/rseq: Change type of rseq_offset to ptrdiff_t
7e617278bf selftests/rseq: x86-32: use %gs segment selector for accessing rseq thread area
27f6361cb4 selftests/rseq: x86-64: use %fs segment selector for accessing rseq thread area
a4312e2d81 selftests/rseq: Fix: work-around asm goto compiler bugs
7e1a0a9a44 selftests/rseq: Remove arm/mips asm goto compiler work-around
ba4d79af71 selftests/rseq: Fix warnings about #if checks of undefined tokens
35c6f5047f selftests/rseq: Fix ppc32 offsets by using long rather than off_t
dbc1f0ee60 selftests/rseq: Fix ppc32 missing instruction selection "u" and "x" for load/store
d4f631ea2d selftests/rseq: Fix ppc32: wrong rseq_cs 32-bit field pointer on big endian
e85fdae4df selftests/rseq: Uplift rseq selftests for compatibility with glibc-2.35
c79e564535 selftests/rseq: Introduce thread pointer getters
4a78bf83e2 selftests/rseq: Introduce rseq_get_abi() helper
3c2a416c80 selftests/rseq: Remove volatile from __rseq_abi
68e1232c6e selftests/rseq: Remove useless assignment to cpu variable
3e77ed4f90 selftests/rseq: introduce own copy of rseq uapi header
54cd556487 selftests/rseq: remove ARRAY_SIZE define from individual tests
14894cf692 hwmon: (ibmaem) don't call platform_device_del() if platform_device_add() fails
f72d410dbf ipv6/sit: fix ipip6_tunnel_get_prl return value
25055da22a sit: use min
652fd40eb0 drivers: cpufreq: Add missing of_node_put() in qoriq-cpufreq.c
79963021fd xen/gntdev: Avoid blocking in unmap_grant_pages()
5f614f5f70 tcp: add a missing nf_reset_ct() in 3WHS handling
9203dfb3ed xfs: fix xfs_reflink_unshare usage of filemap_write_and_wait_range
f874e16870 xfs: update superblock counters correctly for !lazysbcount
7ab7458d7a xfs: fix xfs_trans slab cache name
f12968a5a4 xfs: ensure xfs_errortag_random_default matches XFS_ERRTAG_MAX
da61388f9a xfs: Skip repetitive warnings about mount options
6b7dab812c xfs: rename variable mp to parsing_mp
b261cd005a xfs: use current->journal_info for detecting transaction recursion
c36d41b65e net: tun: avoid disabling NAPI twice
59c51c3b54 tunnels: do not assume mac header is set in skb_tunnel_check_pmtu()
c9fc52c173 io_uring: ensure that send/sendmsg and recv/recvmsg check sqe->ioprio
b8def021ac epic100: fix use after free on rmmod
456bc33887 tipc: move bc link creation back to tipc_node_create
09f9946235 NFC: nxp-nci: Don't issue a zero length i2c_master_read()
7d363362e0 nfc: nfcmrvl: Fix irq_of_parse_and_map() return value
63b2fe509f net: bonding: fix use-after-free after 802.3ad slave unbind
7597ed348e net: bonding: fix possible NULL deref in rlb code
ac12337229 net/sched: act_api: Notify user space if any actions were flushed before error
91d3bb82c4 netfilter: nft_dynset: restore set element counter when failing to update
4b480a7940 s390: remove unneeded 'select BUILD_BIN2C'
e65027fdeb PM / devfreq: exynos-ppmu: Fix refcount leak in of_get_devfreq_events
653bdcd833 caif_virtio: fix race between virtio_device_ready() and ndo_open()
208ff79675 NFSD: restore EINVAL error translation in nfsd_commit()
db82bb6054 net: ipv6: unexport __init-annotated seg6_hmac_net_init()
eb1757ca20 usbnet: fix memory allocation in helpers
fae2a9fb1e linux/dim: Fix divide by 0 in RDMA DIM
b0cab8b517 RDMA/cm: Fix memory leak in ib_cm_insert_listen
9de276dfb2 RDMA/qedr: Fix reporting QP timeout attribute
a42bd00f00 net: dp83822: disable rx error interrupt
9c06d84855 net: dp83822: disable false carrier interrupt
c70ca16f72 net: tun: stop NAPI when detaching queues
bec1be0a74 net: tun: unlink NAPI from device on destruction
0b2499c801 net: dsa: bcm_sf2: force pause link settings
3f55912a1a selftests/net: pass ipv6_args to udpgso_bench's IPv6 TCP test
f7b8fb4584 virtio-net: fix race between ndo_open() and virtio_device_ready()
c0a28f2ddf net: usb: ax88179_178a: Fix packet receiving
8f74cb27c2 net: rose: fix UAF bugs caused by timer handler
6a0b9512a6 SUNRPC: Fix READ_PLUS crasher
ed03a650fb s390/archrandom: simplify back to earlier design and initialize earlier
d8bca518d5 dm raid: fix KASAN warning in raid5_add_disks
9bf2b0757b dm raid: fix accesses beyond end of raid member array
213c550deb powerpc/bpf: Fix use of user_pt_regs in uapi
68a34e478a powerpc/book3e: Fix PUD allocation size in map_kernel_page()
e188bbdb92 powerpc/prom_init: Fix kernel config grep
e6a7d30b65 nvdimm: Fix badblocks clear off-by-one error
0b99c4a189 nvme-pci: add NVME_QUIRK_BOGUS_NID for ADATA XPG SX6000LNP (AKA SPECTRIX S40G)
e77804158b ipv6: take care of disable_policy when restoring routes
03b9e01659 drm/amdgpu: To flush tlb for MMHUB of RAVEN series
ea86c1430c Linux 5.10.128
2d10984d99 net: mscc: ocelot: allow unregistered IP multicast flooding
6a656280e7 powerpc/ftrace: Remove ftrace init tramp once kernel init is complete
6b734f7b70 xfs: check sb_meta_uuid for dabuf buffer recovery
071e750ffb xfs: remove all COW fork extents when remounting readonly
1e76bd4c67 xfs: Fix the free logic of state in xfs_attr_node_hasname
0cdccc05da xfs: punch out data fork delalloc blocks on COW writeback failure
db3f8110c3 xfs: use kmem_cache_free() for kmem_cache objects
09c9902cd8 bcache: memset on stack variables in bch_btree_check() and bch_sectors_dirty_init()
c4ff3ffe01 tick/nohz: unexport __init-annotated tick_nohz_full_setup()
069fff50d4 drm: remove drm_fb_helper_modinit
52dc7f3f6f MAINTAINERS: add Amir as xfs maintainer for 5.10.y
fa7f6a5f56 Merge branch 'android12-5.10' into branch 'android12-5.10-lts'
deb587b1a4 Linux 5.10.127
1cca46c205 powerpc/pseries: wire up rng during setup_arch()
95d73d510b kbuild: link vmlinux only once for CONFIG_TRIM_UNUSED_KSYMS (2nd attempt)
feb5ab7986 random: update comment from copy_to_user() -> copy_to_iter()
959bbaf5b7 modpost: fix section mismatch check for exported init/exit sections
c980392af1 ARM: cns3xxx: Fix refcount leak in cns3xxx_init
889aad2203 memory: samsung: exynos5422-dmc: Fix refcount leak in of_get_dram_timings
44a5b3a073 ARM: Fix refcount leak in axxia_boot_secondary
30bbfeb480 soc: bcm: brcmstb: pm: pm-arm: Fix refcount leak in brcmstb_pm_probe
68f28d52e6 ARM: exynos: Fix refcount leak in exynos_map_pmu
59fdf10814 ARM: dts: imx6qdl: correct PU regulator ramp delay
fb70bd8675 ARM: dts: imx7: Move hsic_phy power domain to HSIC PHY node
f78acc4288 powerpc/powernv: wire up rng during setup_arch
7db1ba660b powerpc/rtas: Allow ibm,platform-dump RTAS call with null buffer address
1f5a9205a3 powerpc: Enable execve syscall exit tracepoint
ca144919af parisc: Enable ARCH_HAS_STRICT_MODULE_RWX
a1c902349a parisc/stifb: Fix fb_is_primary_device() only available with CONFIG_FB_STI
af0ff2da01 xtensa: Fix refcount leak bug in time.c
6c0839cf1b xtensa: xtfpga: Fix refcount leak bug in setup
501652a2ad iio: adc: adi-axi-adc: Fix refcount leak in adi_axi_adc_attach_client
d40514d440 iio: adc: axp288: Override TS pin bias current for some models
d579c893dd iio: adc: stm32: Fix IRQs on STM32F4 by removing custom spurious IRQs message
62284d45e2 iio: adc: stm32: Fix ADCs iteration in irq handler
e3ebb9d16c iio: imu: inv_icm42600: Fix broken icm42600 (chip id 0 value)
3e0af68b99 iio: adc: stm32: fix maximum clock rate for stm32mp15x
b07a30a774 iio: trigger: sysfs: fix use-after-free on remove
399788e819 iio: gyro: mpu3050: Fix the error handling in mpu3050_power_up()
c1ec7d52a2 iio: accel: mma8452: ignore the return value of reset operation
42caf44906 iio:accel:mxc4005: rearrange iio trigger get and register
e26dcf6279 iio:accel:bma180: rearrange iio trigger get and register
f26379e199 iio:chemical:ccs811: rearrange iio trigger get and register
4b6cdcff7c f2fs: attach inline_data after setting compression
2d7bdb6a5a usb: chipidea: udc: check request status before setting device address
656eca37aa USB: gadget: Fix double-free bug in raw_gadget driver
54604108be usb: gadget: Fix non-unique driver names in raw-gadget driver
d87dec22fd xhci-pci: Allow host runtime PM as default for Intel Meteor Lake xHCI
114080d04a xhci-pci: Allow host runtime PM as default for Intel Raptor Lake xHCI
b8142a8465 xhci: turn off port power in shutdown
116c3e81b0 usb: typec: wcove: Drop wrong dependency to INTEL_SOC_PMIC
a547662534 iio: adc: vf610: fix conversion mode sysfs node name
58c3a27e9c iio: mma8452: fix probe fail when device tree compatible is used.
5ee016f612 s390/cpumf: Handle events cycles and instructions identical
abe487a88a gpio: winbond: Fix error code in winbond_gpio_get()
30531e0d7b nvme: move the Samsung X5 quirk entry to the core quirks
169f7d7705 nvme-pci: add NO APST quirk for Kioxia device
938f594266 nvme-pci: allocate nvme_command within driver pdu
ba388d4e9a nvme: don't check nvme_req flags for new req
e7ccaa1aba nvme: mark nvme_setup_passsthru() inline
3ee62a1f07 nvme: split nvme_alloc_request()
fe06c692cd nvme: centralize setting the timeout in nvme_alloc_request
afbc954e78 Revert "net/tls: fix tls_sk_proto_close executed repeatedly"
340fbdc801 virtio_net: fix xdp_rxq_info bug after suspend/resume
3bccf82169 igb: Make DMA faster when CPU is active on the PCIe link
7d7450363f regmap-irq: Fix a bug in regmap_irq_enable() for type_in_mask chips
40b3815b2c ice: ethtool: advertise 1000M speeds properly
7b564e3254 afs: Fix dynamic root getattr
3c22192db0 MIPS: Remove repetitive increase irq_err_count
cc649a7865 x86/xen: Remove undefined behavior in setup_features()
b60c375ad1 selftests: netfilter: correct PKTGEN_SCRIPT_PATHS in nft_concat_range.sh
20119c1e0f udmabuf: add back sanity check
e82376b632 net/tls: fix tls_sk_proto_close executed repeatedly
cec9867ee5 erspan: do not assume transport header is always set
acf76125bb drm/msm/dp: fix connect/disconnect handled at irq_hpd
61f8f4034c drm/msm/dp: promote irq_hpd handle to handle link training correctly
d11cb08215 drm/msm/dp: deinitialize mainlink if link training failed
3d67cb00cb drm/msm/dp: fixes wrong connection state caused by failure of link train
efb2b69160 drm/msm/dp: check core_initialized before disable interrupts at dp_display_unbind()
d16a433982 drm/msm/mdp4: Fix refcount leak in mdp4_modeset_init_intf
363fd6e346 net/sched: sch_netem: Fix arithmetic in netem_dump() for 32-bit platforms
2e3216b929 bonding: ARP monitor spams NETDEV_NOTIFY_PEERS notifiers
c12a2c9b1b igb: fix a use-after-free issue in igb_clean_tx_ring
361c5521c1 tipc: fix use-after-free Read in tipc_named_reinit
f299d3fbe4 tipc: simplify the finalize work queue
ab7f565ac7 phy: aquantia: Fix AN when higher speeds than 1G are not advertised
a51c199e4d bpf, x86: Fix tail call count offset calculation on bpf2bpf call
4ae116428e drm/sun4i: Fix crash during suspend after component bind failure
516760f1d2 bpf: Fix request_sock leak in sk lookup helpers
505a375eea drm/msm: use for_each_sgtable_sg to iterate over scatterlist
10eb239e29 scsi: scsi_debug: Fix zone transition to full condition
15cc30ac2a netfilter: use get_random_u32 instead of prandom
95f80c8843 netfilter: nftables: add nft_parse_register_store() and use it
ec9b0a8d30 netfilter: nftables: add nft_parse_register_load() and use it
8adedb4711 drm/msm: Fix double pm_runtime_disable() call
8682335375 USB: serial: option: add Quectel RM500K module support
9e6e063e54 USB: serial: option: add Quectel EM05-G modem
0b3006a862 USB: serial: option: add Telit LE910Cx 0x1250 composition
f6a266e0dc dm mirror log: clear log bits up to BITS_PER_LONG boundary
03d1874b82 dm era: commit metadata in postsuspend after worker stops
273106c2df ata: libata: add qc->flags in ata_qc_complete_template tracepoint
156427b312 mtd: rawnand: gpmi: Fix setting busy timeout setting
07e56884cd mmc: sdhci-pci-o2micro: Fix card detect by dealing with debouncing
0ae82e1ccb btrfs: add error messages to all unrecognized mount options
49e3e449bc net: openvswitch: fix parsing of nw_proto for IPv6 fragments
1508658aec ALSA: hda/realtek: Add quirk for Clevo NS50PU
6e8e503159 ALSA: hda/realtek: Add quirk for Clevo PD70PNT
80307458a1 ALSA: hda/realtek: Apply fixup for Lenovo Yoga Duet 7 properly
7fcbc89d47 ALSA: hda/realtek - ALC897 headset MIC no sound
f5ea433d56 ALSA: hda/realtek: Add mute LED quirk for HP Omen laptop
6437329060 ALSA: hda/conexant: Fix missing beep setup
12a6be5d11 ALSA: hda/via: Fix missing beep setup
5e80f923b8 random: quiet urandom warning ratelimit suppression message
310ebbd9f5 random: schedule mix_interrupt_randomness() less often
3acb7dc242 vt: drop old FONT ioctls
9cae50bdfa Linux 5.10.126
fb2fbb3c10 io_uring: use separate list entry for iopoll requests
6a7c3bcc3c Linux 5.10.125
df3f3bb505 io_uring: add missing item types for various requests
1a264b3a69 arm64: mm: Don't invalidate FROM_DEVICE buffers at start of DMA transfer
a1508d164e serial: core: Initialize rs485 RTS polarity already on probe
7ccb026ecb tcp: drop the hash_32() part from the index calculation
9429b75bc2 tcp: increase source port perturb table to 2^16
24b922a5da tcp: dynamically allocate the perturb table used by source ports
d28e64b1c6 tcp: add small random increments to the source port
dd46a868fc tcp: use different parts of the port_offset for index and offset
743acb5207 tcp: add some entropy in __inet_hash_connect()
16b1994679 usb: gadget: u_ether: fix regression in setting fixed MAC address
355be61311 zonefs: fix zonefs_iomap_begin() for reads
ee4677b78e s390/mm: use non-quiescing sske for KVM switch to keyed guest
73c2a811f6 Revert "xfrm: Add possibility to set the default to block if we have no policy"
e21944a82a Revert "net: xfrm: fix shift-out-of-bounce"
f7160ab103 Revert "xfrm: make user policy API complete"
df0ff8d194 Revert "xfrm: notify default policy on update"
4ead88c0e8 Revert "xfrm: fix dflt policy check when there is no policy configured"
42dadcf0a8 Revert "xfrm: rework default policy structure"
ece9c2a70f Revert "xfrm: fix "disable_policy" flag use when arriving from different devices"
9dcde7a741 Revert "include/uapi/linux/xfrm.h: Fix XFRM_MSG_MAPPING ABI breakage"
4f3fee72a7 Linux 5.10.124
e0b6018894 clk: imx8mp: fix usb_root_clk parent
a3e50506ea powerpc/book3e: get rid of #include <generated/compile.h>
ff4443f3fc igc: Enable PCIe PTM
f0a7adff63 Revert "PCI: Make pci_enable_ptm() private"
e1513a714d net: openvswitch: fix misuse of the cached connection on tuple changes
09b55dc90b net/sched: act_police: more accurate MTU policing
73bc8a5e8e dma-direct: don't over-decrypt memory
aa9a001efa virtio-pci: Remove wrong address verification in vp_del_vqs()
be98641034 ALSA: hda/realtek: fix right sounds and mute/micmute LEDs for HP machine
401bef1f95 KVM: SVM: Use kzalloc for sev ioctl interfaces to prevent kernel data leak
d6be031a2f KVM: x86: Account a variety of miscellaneous allocations
d74d7865e2 KVM: arm64: Don't read a HW interrupt pending state in user context
bfd004a1d3 ext4: add reserved GDT blocks check
0ca74dacfd ext4: make variable "count" signed
6fdaf31ad5 ext4: fix bug_on ext4_mb_use_inode_pa
e27430c1f1 drm/amd/display: Cap OLED brightness per max frame-average luminance
ba751f0d25 dm mirror log: round up region bitmap size to BITS_PER_LONG
33ba36351e serial: 8250: Store to lsr_save_flags after lsr read
57901c658f usb: gadget: lpc32xx_udc: Fix refcount leak in lpc32xx_udc_probe
a44a8a762f usb: dwc2: Fix memory leak in dwc2_hcd_init
791da3e6c8 USB: serial: io_ti: add Agilent E5805A support
0e13274bc6 USB: serial: option: add support for Cinterion MV31 with new baseline
d721986e96 crypto: memneq - move into lib/
308b8f31c0 comedi: vmk80xx: fix expression for tx buffer size
9308be3d9a mei: me: add raptor lake point S DID
9ea9c92275 i2c: designware: Use standard optional ref clock implementation
506a88a5bf irqchip/gic-v3: Fix refcount leak in gic_populate_ppi_partitions
7c9dd9d23f irqchip/gic-v3: Fix error handling in gic_populate_ppi_partitions
e52a58b79f irqchip/gic/realview: Fix refcount leak in realview_gic_of_init
716587a57a i2c: npcm7xx: Add check for platform_driver_register
b559ef9dfc faddr2line: Fix overlapping text section failures, the sequel
7fa28a7c3d block: Fix handling of offline queues in blk_mq_alloc_request_hctx()
2d825fb53b certs/blacklist_hashes.c: fix const confusion in certs blacklist
bc28fde909 arm64: ftrace: consistently handle PLTs.
e177f17fe4 arm64: ftrace: fix branch range checks
64072389be net: ax25: Fix deadlock caused by skb_recv_datagram in ax25_recvmsg
28069e026e net: bgmac: Fix an erroneous kfree() in bgmac_remove()
984793f255 mlxsw: spectrum_cnt: Reorder counter pools
b90ae84a8a nvme: add device name to warning in uuid_show()
42f7cbe2c2 nvme: use sysfs_emit instead of sprintf
63b26fe025 drm/i915/reset: Fix error_state_read ptr + offset use
2b2180449a misc: atmel-ssc: Fix IRQ check in ssc_probe
65ca4db68b tty: goldfish: Fix free_irq() on remove
5334455067 Drivers: hv: vmbus: Release cpu lock in error case
814092927a i40e: Fix call trace in setup_tx_descriptors
43dfd1169c i40e: Fix calculating the number of queue pairs
ef4d73da0a i40e: Fix adding ADQ filter to TC0
db965e2757 clocksource: hyper-v: unexport __init-annotated hv_init_clocksource()
8acc3e228e pNFS: Avoid a live lock condition in pnfs_update_layout()
03ea83324a pNFS: Don't keep retrying if the server replied NFS4ERR_LAYOUTUNAVAILABLE
4603a37f6e random: credit cpu and bootloader seeds by default
9d667348dc gpio: dwapb: Don't print error on -EPROBE_DEFER
f3c8bfd6dc MIPS: Loongson-3: fix compile mips cpu_hwmon as module build error.
85340c0634 mellanox: mlx5: avoid uninitialized variable warning with gcc-12
38c519df8e net: ethernet: mtk_eth_soc: fix misuse of mem alloc interface netdev[napi]_alloc_frag
b8879ca1fd ipv6: Fix signed integer overflow in l2tp_ip6_sendmsg
0eeec1a8b0 nfc: nfcmrvl: Fix memory leak in nfcmrvl_play_deferred
6c18f47f47 virtio-mmio: fix missing put_device() when vm_cmdline_parent registration failed
d539feb6df ALSA: hda/realtek - Add HW8326 support
16dd002eb8 scsi: pmcraid: Fix missing resource cleanup in error case
410b692621 scsi: ipr: Fix missing/incorrect resource cleanup in error case
85acc5bf05 scsi: lpfc: Allow reduced polling rate for nvme_admin_async_event cmd completion
916145bf9d scsi: lpfc: Fix port stuck in bypassed state after LIP in PT2PT topology
f416fee125 scsi: vmw_pvscsi: Expand vcpuHint to 16 bits
0e9994b865 Input: soc_button_array - also add Lenovo Yoga Tablet2 1051F to dmi_use_low_level_irq
2e640e5e44 ASoC: wm_adsp: Fix event generation for wm_adsp_fw_put()
a572c74402 ASoC: es8328: Fix event generation for deemphasis control
c7b8c3758f ASoC: wm8962: Fix suspend while playing music
8656623bdc quota: Prevent memory allocation recursion while holding dq_lock
36cd19e7d4 ata: libata-core: fix NULL pointer deref in ata_host_alloc_pinfo()
440b2a62da ASoC: cs42l51: Correct minimum value for SX volume control
f93d8fe3dc ASoC: cs42l56: Correct typo in minimum level for SX volume controls
13e5b76d3d ASoC: cs42l52: Correct TLV for Bypass Volume
b8a47bcc4d ASoC: cs53l30: Correct number of volume levels on SX controls
70e355867d ASoC: cs35l36: Update digital volume TLV
cb6a0b83f1 ASoC: cs42l52: Fix TLV scales for mixer controls
d7be05aff2 dma-debug: make things less spammy under memory pressure
1b54c00657 ASoC: nau8822: Add operation for internal PLL off and on
2c9548bc26 powerpc/kasan: Silence KASAN warnings in __get_wchan()
b5699bff1d arm64: dts: imx8mm-beacon: Enable RTS-CTS on UART3
28bbdca6a7 bpf: Fix incorrect memory charge cost calculation in stack_map_alloc()
f14816f2f9 nfsd: Replace use of rwsem with errseq_t
56a7f57da5 9p: missing chunk of "fs/9p: Don't update file type when updating file attributes"
2a59239b22 Linux 5.10.123
aa238a92cc x86/speculation/mmio: Print SMT warning
bde15fdcce KVM: x86/speculation: Disable Fill buffer clear within guests
6df693dca3 x86/speculation/mmio: Reuse SRBDS mitigation for SBDS
cf1c01a5e4 x86/speculation/srbds: Update SRBDS mitigation selection
001415e4e6 x86/speculation/mmio: Add sysfs reporting for Processor MMIO Stale Data
3eb1180564 x86/speculation/mmio: Enable CPU Fill buffer clearing on idle
56f0bca5e9 x86/bugs: Group MDS, TAA & Processor MMIO Stale Data mitigations
26f6f231f6 x86/speculation/mmio: Add mitigation for Processor MMIO Stale Data
f83d4e5be4 x86/speculation: Add a common function for MD_CLEAR mitigation update
e66310bc96 x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug
f8a85334a5 Documentation: Add documentation for Processor MMIO Stale Data
5754c570a5 Linux 5.10.122
9ba2b4ac35 tcp: fix tcp_mtup_probe_success vs wrong snd_cwnd
5e34b49756 dmaengine: idxd: add missing callback function to support DMA_INTERRUPT
b8c17121f0 zonefs: fix handling of explicit_open option on mount
ef51997771 PCI: qcom: Fix pipe clock imbalance
63bcb9da91 md/raid0: Ignore RAID0 layout if the second zone has only one device
418db40cc7 interconnect: Restore sync state by ignoring ipa-virt in provider count
bcae8f8338 interconnect: qcom: sc7180: Drop IP0 interconnects
fe6caf5122 powerpc/mm: Switch obsolete dssall to .long
3be74fc0af powerpc/32: Fix overread/overwrite of thread_struct via ptrace
fa0d3d71dc drm/atomic: Force bridge self-refresh-exit on CRTC switch
dbe04e874d drm/bridge: analogix_dp: Support PSR-exit to disable transition
61297ee0c3 Input: bcm5974 - set missing URB_NO_TRANSFER_DMA_MAP urb flag
2dba96d19d ixgbe: fix unexpected VLAN Rx in promisc mode on VF
91620cded9 ixgbe: fix bcast packets Rx on VF after promisc removal
cdd9227373 nfc: st21nfca: fix incorrect sizing calculations in EVT_TRANSACTION
54423649bc nfc: st21nfca: fix memory leaks in EVT_TRANSACTION handling
4f0a2c46f5 nfc: st21nfca: fix incorrect validating logic in EVT_TRANSACTION
c4e4c07d86 net: phy: dp83867: retrigger SGMII AN when link change
133c9870cd mmc: block: Fix CQE recovery reset success
0248a8c844 ata: libata-transport: fix {dma|pio|xfer}_mode sysfs files
471a413201 cifs: fix reconnect on smb3 mount types
9023ecfd33 cifs: return errors during session setup during reconnects
b423cd2a81 ALSA: hda/realtek: Fix for quirk to enable speaker output on the Lenovo Yoga DuetITL 2021
94bd216d17 ALSA: hda/conexant - Fix loopback issue with CX20632
13639c970f scripts/gdb: change kernel config dumping method
b6ea26873e vringh: Fix loop descriptors check in the indirect cases
362e3b3a59 nodemask: Fix return values to be unsigned
a262e1255b cifs: version operations for smb20 unneeded when legacy support disabled
01137d8980 s390/gmap: voluntarily schedule during key setting
f72df77600 nbd: fix io hung while disconnecting device
122e4adaff nbd: fix race between nbd_alloc_config() and module removal
c0868f6e72 nbd: call genl_unregister_family() first in nbd_cleanup()
cb8da20d71 jump_label,noinstr: Avoid instrumentation for JUMP_LABEL=n builds
320acaf84a x86/cpu: Elide KCSAN for cpu_has() and friends
8287687821 modpost: fix undefined behavior of is_arm_mapping_symbol()
fee8ae0a0b drm/radeon: fix a possible null pointer dereference
3e57686830 ceph: allow ceph.dir.rctime xattr to be updatable
7fa8312879 Revert "net: af_key: add check for pfkey_broadcast in function pfkey_process"
ebfe279725 scsi: myrb: Fix up null pointer access on myrb_cleanup()
7eb32f286e md: protect md_unregister_thread from reentrancy
668c3f9fa2 watchdog: wdat_wdt: Stop watchdog when rebooting the system
e20bc8b5a2 kernfs: Separate kernfs_pr_cont_buf and rename_lock.
1e3b3a5762 serial: msm_serial: disable interrupts in __msm_console_write()
ff727ab0b7 staging: rtl8712: fix uninit-value in r871xu_drv_init()
33ef21d554 staging: rtl8712: fix uninit-value in usb_read8() and friends
f3f754d72d clocksource/drivers/sp804: Avoid error on multiple instances
abf3b22261 extcon: Modify extcon device to be created after driver data is set
41ec946694 misc: rtsx: set NULL intfdata when probe fails
5b0c0298f7 usb: dwc2: gadget: don't reset gadget's driver->bus
468fe959ea sysrq: do not omit current cpu when showing backtrace of all active CPUs
f4cb24706c USB: hcd-pci: Fully suspend across freeze/thaw cycle
ffe9440d69 drivers: usb: host: Fix deadlock in oxu_bus_suspend()
6e2273eefa drivers: tty: serial: Fix deadlock in sa1100_set_termios()
ee105039d3 USB: host: isp116x: check return value after calling platform_get_resource()
0f69d7d5e9 drivers: staging: rtl8192e: Fix deadlock in rtllib_beacons_stop()
66f769762f drivers: staging: rtl8192u: Fix deadlock in ieee80211_beacons_stop()
cb7147afd3 tty: Fix a possible resource leak in icom_probe
d68d5e68b7 tty: synclink_gt: Fix null-pointer-dereference in slgt_clean()
61ca1b97ad lkdtm/usercopy: Expand size of "out of frame" object
7821d743ab iio: st_sensors: Add a local lock for protecting odr
5a89a92efc staging: rtl8712: fix a potential memory leak in r871xu_drv_init()
8caa4b7d41 iio: dummy: iio_simple_dummy: check the return value of kstrdup()
f091e29ed8 drm: imx: fix compiler warning with gcc-12
96bf5ed057 net: altera: Fix refcount leak in altera_tse_mdio_create
fbeb8dfa8b ip_gre: test csum_start instead of transport header
1981cd7a77 net/mlx5: fs, fail conflicting actions
652418d82b net/mlx5: Rearm the FW tracer after each tracer event
5d9c1b081a net: ipv6: unexport __init-annotated seg6_hmac_init()
be3884d5cd net: xfrm: unexport __init-annotated xfrm4_protocol_init()
7759c32228 net: mdio: unexport __init-annotated mdio_bus_init()
b585b87fd5 SUNRPC: Fix the calculation of xdr->end in xdr_get_next_encode_buffer()
3d8122e169 net/mlx4_en: Fix wrong return value on ioctl EEPROM query failure
c2ae49a113 net: dsa: lantiq_gswip: Fix refcount leak in gswip_gphy_fw_list
0cf7aaff29 bpf, arm64: Clear prog->jited_len along prog->jited
c61848500a af_unix: Fix a data-race in unix_dgram_peer_wake_me().
be9581f4fd xen: unexport __init-annotated xen_xlate_map_ballooned_pages()
86c87d2c03 netfilter: nf_tables: bail out early if hardware offload is not supported
330c0c6cd2 netfilter: nf_tables: memleak flow rule from commit path
67e2d44873 netfilter: nf_tables: release new hooks on unsupported flowtable flags
19cb3ece14 ata: pata_octeon_cf: Fix refcount leak in octeon_cf_probe
ec5548066d netfilter: nf_tables: always initialize flowtable hook list in transaction
7fd03e34f0 powerpc/kasan: Force thread size increase with KASAN
7a248f9c74 netfilter: nf_tables: delete flowtable hooks via transaction list
9edafbc7ec netfilter: nat: really support inet nat without l3 address
8dbae5affb xprtrdma: treat all calls not a bcall when bc_serv is NULL
8b3d5bafb1 video: fbdev: pxa3xx-gcu: release the resources correctly in pxa3xx_gcu_probe/remove()
c09b873f3f video: fbdev: hyperv_fb: Allow resolutions with size > 64 MB for Gen1
0ee5b9644f NFSv4: Don't hold the layoutget locks across multiple RPC calls
95a0ba85c1 dmaengine: zynqmp_dma: In struct zynqmp_dma_chan fix desc_size data type
2c08cae19d m68knommu: fix undefined reference to `_init_sp'
d99f04df32 m68knommu: set ZERO_PAGE() to the allocated zeroed page
344a55ccf5 i2c: cadence: Increase timeout per message if necessary
32bea51fe4 f2fs: remove WARN_ON in f2fs_is_valid_blkaddr
54c1e0e3bb iommu/arm-smmu-v3: check return value after calling platform_get_resource()
3660db29b0 iommu/arm-smmu: fix possible null-ptr-deref in arm_smmu_device_probe()
9e801c891a tracing: Avoid adding tracer option before update_tracer_options
1788e6dbb6 tracing: Fix sleeping function called from invalid context on RT kernel
2f452a3306 bootconfig: Make the bootconfig.o as a normal object file
c667b3872a mips: cpc: Fix refcount leak in mips_cpc_default_phys_base
76b226eaf0 dmaengine: idxd: set DMA_INTERRUPT cap bit
32be2b805a perf c2c: Fix sorting in percent_rmt_hitm_cmp()
71cbce7503 driver core: Fix wait_for_device_probe() & deferred_probe_timeout interaction
b8fac8e321 tipc: check attribute length for bearer name
c1f0187025 scsi: sd: Fix potential NULL pointer dereference
d2e297eaf4 afs: Fix infinite loop found by xfstest generic/676
04622d6318 gpio: pca953x: use the correct register address to do regcache sync
0a0f7f8414 tcp: tcp_rtx_synack() can be called from process context
e05dd93826 net: sched: add barrier to fix packet stuck problem for lockless qdisc
e9fe72b95d net/mlx5e: Update netdev features after changing XDP state
b50eef7a38 net/mlx5: correct ECE offset in query qp output
ea5edd015f net/mlx5: Don't use already freed action pointer
bf2af9b243 sfc: fix wrong tx channel offset with efx_separate_tx_channels
8f81a4113e sfc: fix considering that all channels have TX queues
7ac3a034d9 nfp: only report pause frame configuration for physical device
630e0a10c0 net/smc: fixes for converting from "struct smc_cdc_tx_pend **" to "struct smc_wr_tx_pend_priv *"
b97550e380 riscv: read-only pages should not be writable
8f49e1694c bpf: Fix probe read error in ___bpf_prog_run()
6d8d3f68cb ubi: ubi_create_volume: Fix use-after-free when volume creation failed
f413e4d7cd ubi: fastmap: Fix high cpu usage of ubi_bgt by making sure wl_pool not empty
3252d327f9 jffs2: fix memory leak in jffs2_do_fill_super
741e49eacd modpost: fix removing numeric suffixes
42658e47f1 net: dsa: mv88e6xxx: Fix refcount leak in mv88e6xxx_mdios_register
f7ba2cc57f net: ethernet: ti: am65-cpsw-nuss: Fix some refcount leaks
71ae30662e net: ethernet: mtk_eth_soc: out of bounds read in mtk_hwlro_get_fdir_entry()
503a3fd646 net: sched: fixed barrier to prevent skbuff sticking in qdisc backlog
ee89d7fd49 s390/crypto: fix scatterwalk_unmap() callers in AES-GCM
e892a7e60f clocksource/drivers/oxnas-rps: Fix irq_of_parse_and_map() return value
1d7361679f ASoC: fsl_sai: Fix FSL_SAI_xDR/xFR definition
910b1cdf6c watchdog: ts4800_wdt: Fix refcount leak in ts4800_wdt_probe
b3354f2046 watchdog: rti-wdt: Fix pm_runtime_get_sync() error checking
36ee9ffca8 driver core: fix deadlock in __device_attach
823f24f2e3 driver: base: fix UAF when driver_attach failed
7a6337bfed bus: ti-sysc: Fix warnings for unbind for serial
985706bd3b firmware: dmi-sysfs: Fix memory leak in dmi_sysfs_register_handle
94acaaad47 serial: stm32-usart: Correct CSIZE, bits, and parity
b7e560d2ff serial: st-asc: Sanitize CSIZE and correct PARENB for CS7
afcfc3183c serial: sifive: Sanitize CSIZE and c_iflag
a9f6bee486 serial: sh-sci: Don't allow CS5-6
00456b932e serial: txx9: Don't allow CS5-6
22e975796f serial: rda-uart: Don't allow CS5-6
ff4ce2979b serial: digicolor-usart: Don't allow CS5-6
5cd331bcf0 serial: 8250_fintek: Check SER_RS485_RTS_* only with RS485
260792d5c9 serial: meson: acquire port->lock in startup()
82bfea344e rtc: mt6397: check return value after calling platform_get_resource()
d54a51b518 clocksource/drivers/riscv: Events are stopped during CPU suspend
5b3e990f85 soc: rockchip: Fix refcount leak in rockchip_grf_init
cfe8a0967d extcon: ptn5150: Add queue work sync before driver release
96414e2cdc coresight: cpu-debug: Replace mutex with mutex_trylock on panic notifier
47ebc50dc2 serial: sifive: Report actual baud base rather than fixed 115200
ab35308bbd phy: qcom-qmp: fix pipe-clock imbalance on power-on failure
52f327a45c rpmsg: qcom_smd: Fix returning 0 if irq_of_parse_and_map() fails
c10333c451 iio: adc: sc27xx: Fine tune the scale calibration values
3747429834 iio: adc: sc27xx: fix read big scale voltage not right
b30f2315a3 iio: proximity: vl53l0x: Fix return value check of wait_for_completion_timeout
43823ceb26 iio: adc: stmpe-adc: Fix wait_for_completion_timeout return value check
6f01c0fb8e usb: typec: mux: Check dev_set_name() return value
7027c890ff firmware: stratix10-svc: fix a missing check on list iterator
70ece3c5ec misc: fastrpc: fix an incorrect NULL check on list iterator
2a1bf8e5ad usb: dwc3: pci: Fix pm_runtime_get_sync() error checking
8ae4fed195 rpmsg: qcom_smd: Fix irq_of_parse_and_map() return value
572211d631 pwm: lp3943: Fix duty calculation in case period was clamped
f9782b26d6 staging: fieldbus: Fix the error handling path in anybuss_host_common_probe()
b382c0c3b8 usb: musb: Fix missing of_node_put() in omap2430_probe
6b7cf22122 USB: storage: karma: fix rio_karma_init return
e100742823 usb: usbip: add missing device lock on tweak configuration cmd
bcbb795a9e usb: usbip: fix a refcount leak in stub_probe()
4e3a2d77bd tty: serial: fsl_lpuart: fix potential bug when using both of_alias_get_id and ida_simple_get
e27376f5aa tty: n_tty: Restore EOF push handling behavior
11bc6eff3a tty: serial: owl: Fix missing clk_disable_unprepare() in owl_uart_probe
ee6c33b29e tty: goldfish: Use tty_port_destroy() to destroy port
56ac04f35f lkdtm/bugs: Check for the NULL pointer after calling kmalloc
03efa70eb0 iio: adc: ad7124: Remove shift from scan_type
4610b06761 staging: greybus: codecs: fix type confusion of list iterator variable
1509d2335d pcmcia: db1xxx_ss: restrict to MIPS_DB1XXX boards
e2e52b40ef Linux 5.10.121
47c1680e51 md: bcache: check the return value of kzalloc() in detached_dev_do_request()
a67100f426 ext4: only allow test_dummy_encryption when supported
96662c7746 MIPS: IP30: Remove incorrect `cpu_has_fpu' override
57e561573f MIPS: IP27: Remove incorrect `cpu_has_fpu' override
bb55ca1612 RDMA/rxe: Generate a completion for unsupported/invalid opcode
72268945b1 Revert "random: use static branch for crng_ready()"
6b03dc67dd block: fix bio_clone_blkg_association() to associate with proper blkcg_gq
51f724bffa bfq: Make sure bfqg for which we are queueing requests is online
0285718e28 bfq: Get rid of __bio_blkcg() usage
80b0a2b3df bfq: Remove pointless bfq_init_rq() calls
13599aac1b bfq: Drop pointless unlock-lock pair
7d172b9dc9 bfq: Avoid merging queues with different parents
54cdc10ac7 thermal/core: Fix memory leak in the error path
b132abaa65 thermal/core: fix a UAF bug in __thermal_cooling_device_register()
ec1378f2fa kseltest/cgroup: Make test_stress.sh work if run interactively
82b2b60b67 xfs: assert in xfs_btree_del_cursor should take into account error
f1916a88c8 xfs: consider shutdown in bmapbt cursor delete assert
e3ffe7387c xfs: force log and push AIL to clear pinned inodes when aborting mount
0b229d03d0 xfs: restore shutdown check in mapped write fault path
3d05a855dc xfs: fix incorrect root dquot corruption error when switching group/project quota types
893cf5f68a xfs: fix chown leaking delalloc quota blocks when fssetxattr fails
643ceee253 xfs: sync lazy sb accounting on quiesce of read-only mounts
af26bfb04a xfs: set inode size after creating symlink
d27f0000d7 net: ipa: fix page free in ipa_endpoint_replenish_one()
70124d94f4 net: ipa: fix page free in ipa_endpoint_trans_release()
2156dc3904 phy: qcom-qmp: fix reset-controller leak on probe errors
67e3404889 coresight: core: Fix coresight device probe failure issue
77692c02e1 blk-iolatency: Fix inflight count imbalances and IO hangs on offline
19e5aac38a vdpasim: allow to enable a vq repeatedly
ec029087df dt-bindings: gpio: altera: correct interrupt-cells
0ac587c61f docs/conf.py: Cope with removal of language=None in Sphinx 5.0.0
6182c71a0c SMB3: EBADF/EIO errors in rename/open caused by race condition in smb2_compound_op
d6b9b220d1 ARM: pxa: maybe fix gpio lookup tables
39c61f4f7f ARM: dts: s5pv210: Remove spi-cs-high on panel in Aries
6f3673c8d8 phy: qcom-qmp: fix struct clk leak on probe errors
09a84dad95 arm64: dts: qcom: ipq8074: fix the sleep clock frequency
591c3481b1 gma500: fix an incorrect NULL check on list iterator
c521f42dd2 tilcdc: tilcdc_external: fix an incorrect NULL check on list iterator
10c5088a31 serial: pch: don't overwrite xmit->buf[0] by x_char
59afd4f287 bcache: avoid journal no-space deadlock by reserving 1 journal bucket
0cf22f234e bcache: remove incremental dirty sector counting for bch_sectors_dirty_init()
3f686b249b bcache: improve multithreaded bch_sectors_dirty_init()
46c2b5f81c bcache: improve multithreaded bch_btree_check()
4e2fbe8cda stm: ltdc: fix two incorrect NULL checks on list iterator
dc12a64cf8 carl9170: tx: fix an incorrect use of list iterator
8f1bc0edf5 ASoC: rt5514: Fix event generation for "DSP Voice Wake Up" control
769ec2a824 rtl818x: Prevent using not initialized queues
d787a57a17 xtensa/simdisk: fix proc_read_simdisk()
63758dd959 hugetlb: fix huge_pmd_unshare address update
90ad54714e nodemask.h: fix compilation error with GCC12
e9514bce2f iommu/msm: Fix an incorrect NULL check on list iterator
82c888e51c ftrace: Clean up hash direct_functions on register failures
c26ccbaeb8 kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]
cf0dabc374 um: Fix out-of-bounds read in LDT setup
7f8fd5dd43 um: chan_user: Fix winch_tramp() return value
873069e393 mac80211: upgrade passive scan to active scan on DFS channels after beacon rx
22741dd048 cfg80211: declare MODULE_FIRMWARE for regulatory.db
e87fedad4a irqchip: irq-xtensa-mx: fix initial IRQ affinity
be7ae7cd1c irqchip/armada-370-xp: Do not touch Performance Counter Overflow on A375, A38x, A39x
df7f0f8be3 csky: patch_text: Fixup last cpu should be master
31dca00d0c RDMA/hfi1: Fix potential integer multiplication overflow errors
09408080ad Kconfig: Add option for asm goto w/ tied outputs to workaround clang-13 bug
b67adaec34 ima: remove the IMA_TEMPLATE Kconfig option
577a959cb0 media: coda: Add more H264 levels for CODA960
4005f6a25c media: coda: Fix reported H264 profile
d09dad0057 mtd: cfi_cmdset_0002: Use chip_ready() for write on S29GL064N
08788b917b mtd: cfi_cmdset_0002: Move and rename chip_check/chip_ready/chip_good_for_write
b2b0144422 md: fix an incorrect NULL check in md_reload_sb
2401f1cf3d md: fix an incorrect NULL check in does_sb_need_changing
e28321e013 drm/i915/dsi: fix VBT send packet port selection for ICL+
495ac77576 drm/bridge: analogix_dp: Grab runtime PM reference for DP-AUX
addf0ae792 drm/nouveau/kms/nv50-: atom: fix an incorrect NULL check on list iterator
97a9ec86cc drm/nouveau/clk: Fix an incorrect NULL check on list iterator
436cff507f drm/etnaviv: check for reaped mapping in etnaviv_iommu_unmap_gem
be585921f2 drm/amdgpu/cs: make commands with 0 chunks illegal behaviour.
556e404691 scsi: ufs: qcom: Add a readl() to make sure ref_clk gets enabled
f297dc2364 scsi: dc395x: Fix a missing check on list iterator
337e365507 ocfs2: dlmfs: fix error handling of user_dlm_destroy_lock
4ca3ac06e7 dlm: fix missing lkb refcount handling
899bc44291 dlm: fix plock invalid read
74114d26e9 s390/perf: obtain sie_block from the right address
7994d89012 mm, compaction: fast_find_migrateblock() should return pfn in the target zone
99fd821f56 PCI: qcom: Fix unbalanced PHY init on probe errors
c0e129dafc PCI: qcom: Fix runtime PM imbalance on probe errors
2b4c6ad382 PCI/PM: Fix bridge_d3_blacklist[] Elo i2 overwrite of Gigabyte X299
058cb6d86b tracing: Fix potential double free in create_var_ref()
a2b9edc3f8 ACPI: property: Release subnode properties with data nodes
ff4cafa517 ext4: avoid cycles in directory h-tree
da2f059192 ext4: verify dir block before splitting it
4fd58b5cf1 ext4: fix bug_on in __es_tree_search
cc5b09cb6d ext4: filter out EXT4_FC_REPLAY from on-disk superblock field s_state
1b061af037 ext4: fix bug_on in ext4_writepages
adf490083c ext4: fix warning in ext4_handle_inode_extension
dd887f83ea ext4: fix use-after-free in ext4_rename_dir_prepare
70a7dea846 bfq: Track whether bfq_group is still online
b06691af08 bfq: Update cgroup information before merging bio
4dfc12f8c9 bfq: Split shared queues on move between cgroups
c072cab98b efi: Do not import certificates from UEFI Secure Boot for T2 Macs
9a9dc60da7 fs-writeback: writeback_sb_inodes:Recalculate 'wrote' according skipped pages
c1ad58de13 iwlwifi: mvm: fix assert 1F04 upon reconfig
6118bbdf69 wifi: mac80211: fix use-after-free in chanctx code
efdefbe8b7 f2fs: fix to do sanity check for inline inode
2221a2d410 f2fs: fix fallocate to use file_modified to update permissions consistently
ef221b738b f2fs: fix to do sanity check on total_data_blocks
196f72e089 f2fs: don't need inode lock for system hidden quota
2e790aa378 f2fs: fix deadloop in foreground GC
ccd58045be f2fs: fix to clear dirty inode in f2fs_evict_inode()
a34d7b4989 f2fs: fix to do sanity check on block address in f2fs_do_zero_range()
2766ddaf45 f2fs: fix to avoid f2fs_bug_on() in dec_valid_node_count()
d8b6aaeb9a perf jevents: Fix event syntax error caused by ExtSel
c8c2802407 perf c2c: Use stdio interface if slang is not supported
c9542f5f90 i2c: rcar: fix PM ref counts in probe error paths
ebd4f37ac1 i2c: npcm: Handle spurious interrupts
5c0dfca6b9 i2c: npcm: Correct register access width
06cb0f056b i2c: npcm: Fix timeout calculation
de6f6b5400 iommu/amd: Increase timeout waiting for GA log enablement
3cfb546439 dmaengine: stm32-mdma: fix chan initialization in stm32_mdma_irq_handler()
13d8d11dfa dmaengine: stm32-mdma: rework interrupt handler
0f87bd8b5f dmaengine: stm32-mdma: remove GISR1 register
c1c4405222 video: fbdev: clcdfb: Fix refcount leak in clcdfb_of_vram_setup
96fdbb1c85 NFSv4/pNFS: Do not fail I/O when we fail to allocate the pNFS layout
83839a333f NFS: Don't report errors from nfs_pageio_complete() more than once
040242365c NFS: Do not report flush errors in nfs_write_end()
c5a0e59bbe NFS: fsync() should report filesystem errors over EINTR/ERESTARTSYS
418b9fa434 NFS: Do not report EINTR/ERESTARTSYS as mapping errors
6073af7815 dmaengine: idxd: Fix the error handling path in idxd_cdev_register()
f57696bc63 i2c: at91: Initialize dma_buf in at91_twi_xfer()
8e49773a75 MIPS: Loongson: Use hwmon_device_register_with_groups() to register hwmon
ec5ded7acb cpufreq: mediatek: Unregister platform device on exit
9d91400fff cpufreq: mediatek: Use module_init and add module_exit
c7b0ec9744 cpufreq: mediatek: add missing platform_driver_unregister() on error in mtk_cpufreq_driver_init
fb02d6b543 i2c: at91: use dma safe buffers
da748d263a iommu/mediatek: Add list_del in mtk_iommu_remove
51d584704d f2fs: fix dereference of stale list iterator after loop body
0e0faa1431 OPP: call of_node_put() on error path in _bandwidth_supported()
baf86afed7 Input: stmfts - do not leave device disabled in stmfts_input_open
fc0750e659 RDMA/hfi1: Prevent use of lock before it is initialized
bb2220e067 mailbox: forward the hrtimer if not queued and under a lock
a1d4941d9a mfd: davinci_voicecodec: Fix possible null-ptr-deref davinci_vc_probe()
46fd994763 powerpc/fsl_rio: Fix refcount leak in fsl_rio_setup
b8ef79697b macintosh: via-pmu and via-cuda need RTC_LIB
cca915d691 powerpc/perf: Fix the threshold compare group constraint for power9
7620a280da powerpc/64: Only WARN if __pa()/__va() called with bad addresses
9b28515641 hwrng: omap3-rom - fix using wrong clk_disable() in omap_rom_rng_runtime_resume()
40d428b528 PCI/AER: Clear MULTI_ERR_COR/UNCOR_RCV bits
6e07ccc7d5 Input: sparcspkr - fix refcount leak in bbc_beep_probe
76badb0a4d crypto: cryptd - Protect per-CPU resource by disabling BH.
40c41a7bfd crypto: sun8i-ss - handle zero sized sg
5bea8f700a crypto: sun8i-ss - rework handling of IV
9834b13e8b tty: fix deadlock caused by calling printk() under tty_port->lock
a21d4dab77 PCI: imx6: Fix PERST# start-up sequence
2a9d3b5118 ipc/mqueue: use get_tree_nodev() in mqueue_get_tree()
f061ddfed9 proc: fix dentry/inode overinstantiating under /proc/${pid}/net
ab0c26e441 ASoC: atmel-classd: Remove endianness flag on class d component
b716e4168d ASoC: atmel-pdmic: Remove endianness flag on pdmic component
456105105e powerpc/4xx/cpm: Fix return value of __setup() handler
de5bc92318 powerpc/idle: Fix return value of __setup() handler
f991879762 pinctrl: renesas: core: Fix possible null-ptr-deref in sh_pfc_map_resources()
f7c290eac8 powerpc/8xx: export 'cpm_setbrg' for modules
49a5b1735c drivers/base/memory: fix an unlikely reference counting issue in __add_memory_block()
c121942917 dax: fix cache flush on PMD-mapped pages
d8a5bdc767 drivers/base/node.c: fix compaction sysfs file leak
84958f066d pinctrl: mvebu: Fix irq_of_parse_and_map() return value
8a8b40d007 nvdimm: Allow overwrite in the presence of disabled dimms
641649f31e nvdimm: Fix firmware activation deadlock scenarios
1052f22e12 firmware: arm_scmi: Fix list protocols enumeration in the base protocol
7a55a5159d scsi: fcoe: Fix Wstringop-overflow warnings in fcoe_wwn_from_mac()
17d9d7d264 mfd: ipaq-micro: Fix error check return value of platform_get_irq()
82c6c8a66c powerpc/fadump: fix PT_LOAD segment for boot memory area
08b053d32b arm: mediatek: select arch timer for mt7629
ceb61ab22d pinctrl: bcm2835: implement hook for missing gpio-ranges
cda45b715d gpiolib: of: Introduce hook for missing gpio-ranges
a26dfdf0a6 crypto: marvell/cesa - ECB does not IV
ee89d8dee5 misc: ocxl: fix possible double free in ocxl_file_register_afu
22c3fea20a ARM: dts: bcm2835-rpi-b: Fix GPIO line names
0a4ee6cdaa ARM: dts: bcm2837-rpi-3-b-plus: Fix GPIO line name of power LED
bd7ffc171c ARM: dts: bcm2837-rpi-cm3-io3: Fix GPIO line names for SMPS I2C
daffdb0830 ARM: dts: bcm2835-rpi-zero-w: Fix GPIO line name for Wifi/BT
95000ae680 ARM: dts: stm32: Fix PHY post-reset delay on Avenger96
b439f7addd can: xilinx_can: mark bit timing constants as const
875a17c3ad platform/chrome: Re-introduce cros_ec_cmd_xfer and use it for ioctls
b0bf87b1b3 ARM: dts: imx6dl-colibri: Fix I2C pinmuxing
acd2313bd9 platform/chrome: cros_ec: fix error handling in cros_ec_register()
e690350d3d KVM: nVMX: Clear IDT vectoring on nested VM-Exit for double/triple fault
fd7dca68a6 KVM: nVMX: Leave most VM-Exit info fields unmodified on failed VM-Entry
259c1fad9f soc: qcom: llcc: Add MODULE_DEVICE_TABLE()
ca7ce579a7 ARM: dts: ci4x10: Adapt to changes in imx6qdl.dtsi regarding fec clocks
acd99f384c PCI: dwc: Fix setting error return on MSI DMA mapping failure
92b7cab307 PCI: rockchip: Fix find_first_zero_bit() limit
266f5cf692 PCI: cadence: Fix find_first_zero_bit() limit
a409d0b1f9 soc: qcom: smsm: Fix missing of_node_put() in smsm_parse_ipc
7cbe94d296 soc: qcom: smp2p: Fix missing of_node_put() in smp2p_parse_ipc
8365341798 ARM: dts: suniv: F1C100: fix watchdog compatible
ea4f1c6bb9 memory: samsung: exynos5422-dmc: Avoid some over memory allocation
3960629bb5 arm64: dts: rockchip: Move drive-impedance-ohm to emmc phy on rk3399
0c5f04da02 net/smc: postpone sk_refcnt increment in connect()
8096e2d7c0 hinic: Avoid some over memory allocation
dc7753d600 net: huawei: hinic: Use devm_kcalloc() instead of devm_kzalloc()
4790963ef4 rxrpc: Fix decision on when to generate an IDLE ACK
3eef677a25 rxrpc: Don't let ack.previousPacket regress
573de88fc1 rxrpc: Fix overlapping ACK accounting
4f1c34ee60 rxrpc: Don't try to resend the request if we're receiving the reply
5b4826657d rxrpc: Fix listen() setting the bar too high for the prealloc rings
541224201e hv_netvsc: Fix potential dereference of NULL pointer
deb16df525 net: stmmac: fix out-of-bounds access in a selftest
5c2b34d072 net: stmmac: selftests: Use kcalloc() instead of kzalloc()
7386f69041 ASoC: max98090: Move check for invalid values before casting in max98090_put_enab_tlv()
d015f6f694 NFC: hci: fix sleep in atomic context bugs in nfc_hci_hcp_message_tx
7a5e6a4898 ASoC: wm2000: fix missing clk_disable_unprepare() on error in wm2000_anc_transition()
8bbf522a2c thermal/drivers/imx_sc_thermal: Fix refcount leak in imx_sc_thermal_probe
18530bedd2 thermal/core: Fix memory leak in __thermal_cooling_device_register()
dcf5ffc91c thermal/drivers/core: Use a char pointer for the cooling device name
79098339ac thermal/drivers/broadcom: Fix potential NULL dereference in sr_thermal_probe
8360380295 thermal/drivers/bcm2711: Don't clamp temperature at zero
3161044e75 drm/i915: Fix CFI violation with show_dynamic_id()
ffbcfb1688 drm/msm/dpu: handle pm_runtime_get_sync() errors in bind path
2679de7d04 x86/sev: Annotate stack change in the #VC handler
656aa3c51f drm: msm: fix possible memory leak in mdp5_crtc_cursor_set()
48e82ce8cd drm/msm/a6xx: Fix refcount leak in a6xx_gpu_init
d54ac6ca48 ext4: reject the 'commit' option on ext2 filesystems
63b7c08995 media: rkvdec: h264: Fix bit depth wrap in pps packet
b4805a77d5 media: rkvdec: h264: Fix dpb_valid implementation
82239e30ab media: staging: media: rkvdec: Make use of the helper function devm_platform_ioremap_resource()
5c24566294 media: ov7670: remove ov7670_power_off from ov7670_remove
510e879420 ASoC: ti: j721e-evm: Fix refcount leak in j721e_soc_probe_*
33411945c9 net: hinic: add missing destroy_workqueue in hinic_pf_to_mgmt_init
8113eedbab sctp: read sk->sk_bound_dev_if once in sctp_rcv()
6950ee32c1 lsm,selinux: pass flowi_common instead of flowi to the LSM hooks
a67a1661cf m68k: math-emu: Fix dependencies of math emulation support
4dcae15ff8 nvme: set dma alignment to dword
8ace1e6355 Bluetooth: use hdev lock for accept_list and reject_list in conn req
792f8b0e74 Bluetooth: use inclusive language when filtering devices
d763aa352c Bluetooth: use inclusive language in HCI role comments
c024f6f11d Bluetooth: LL privacy allow RPA
394df9f17e Bluetooth: L2CAP: Rudimentary typo fixes
5702c3c657 Bluetooth: Interleave with allowlist scan
36c644c63b Bluetooth: fix dangling sco_conn and use-after-free in sco_sock_timeout
fc68385fcb media: vsp1: Fix offset calculation for plane cropping
a3304766d9 media: pvrusb2: fix array-index-out-of-bounds in pvr2_i2c_core_init
7d792640d3 media: exynos4-is: Change clk_disable to clk_disable_unprepare
b3e4837358 media: st-delta: Fix PM disable depth imbalance in delta_probe
8e4e0c4ac5 media: exynos4-is: Fix PM disable depth imbalance in fimc_is_probe
0572a5bd38 media: aspeed: Fix an error handling path in aspeed_video_probe()
34feaea3aa scripts/faddr2line: Fix overlapping text section failures
1472fb1c74 kselftest/cgroup: fix test_stress.sh to use OUTPUT dir
cacea459f9 ASoC: samsung: Fix refcount leak in aries_audio_probe
c1b08aa568 ASoC: samsung: Use dev_err_probe() helper
9f564e29a5 regulator: pfuze100: Fix refcount leak in pfuze_parse_regulators_dt
2a0da7641e ASoC: mxs-saif: Fix refcount leak in mxs_saif_probe
e84aaf23ca ASoC: fsl: Fix refcount leak in imx_sgtl5000_probe
4024affd53 ath11k: Don't check arvif->is_started before sending management frames
779d41c80b perf/amd/ibs: Use interrupt regs ip for stack unwinding
37a9db0ee7 regulator: qcom_smd: Fix up PM8950 regulator configuration
e2786db0a7 Revert "cpufreq: Fix possible race in cpufreq online error path"
560dcbe1c7 spi: spi-fsl-qspi: check return value after calling platform_get_resource_byname()
f40549ce20 iomap: iomap_write_failed fix
7a79ab2596 media: uvcvideo: Fix missing check to determine if element is found in list
d50b26221f drm/msm: return an error pointer in msm_gem_prime_get_sg_table()
883f1d52a5 drm/msm/mdp5: Return error code in mdp5_mixer_release when deadlock is detected
49dc28b4b2 drm/msm/mdp5: Return error code in mdp5_pipe_release when deadlock is detected
a10092daba drm/msm/dp: fix event thread stuck in wait_event after kthread_stop()
369a712442 regulator: core: Fix enable_count imbalance with EXCLUSIVE_GET
018ebe4c18 arm64: fix types in copy_highpage()
49bfbaf6a0 x86/mm: Cleanup the control_va_addr_alignment() __setup handler
0d5c8ac922 irqchip/aspeed-scu-ic: Fix irq_of_parse_and_map() return value
f4b503b4ef irqchip/aspeed-i2c-ic: Fix irq_of_parse_and_map() return value
5e76e51633 irqchip/exiu: Fix acknowledgment of edge triggered interrupts
35abf2081f x86: Fix return value of __setup handlers
940b12435b virtio_blk: fix the discard_granularity and discard_alignment queue limits
23716d7614 perf tools: Use Python devtools for version autodetection rather than runtime
3451852312 drm/rockchip: vop: fix possible null-ptr-deref in vop_bind()
e19ece6f24 drm/panel: panel-simple: Fix proper bpc for AM-1280800N3TZQW-T00H
5a26a49470 drm/msm: add missing include to msm_drv.c
7b815e91ff drm/msm/hdmi: fix error check return value of irq_of_parse_and_map()
d9cb951d11 drm/msm/hdmi: check return value after calling platform_get_resource_byname()
e99755e6a9 drm/msm/dsi: fix error checks and return values for DSI xmit functions
3574e0b290 drm/msm/dp: fix error check return value of irq_of_parse_and_map()
04204612dd drm/msm/dp: stop event kernel thread when DP unbind
134760263f drm/msm/disp/dpu1: set vbif hw config to NULL to avoid use after memory free during pm runtime resume
d5773db56c perf tools: Add missing headers needed by util/data.h
e251a33fe8 ASoC: rk3328: fix disabling mclk on pclk probe failure
e2fef34d78 x86/speculation: Add missing prototype for unpriv_ebpf_notify()
81f1ddffdc mtd: rawnand: cadence: fix possible null-ptr-deref in cadence_nand_dt_probe()
b6ecf2b7e6 x86/pm: Fix false positive kmemleak report in msr_build_context()
0e1cd4edef mtd: spi-nor: core: Check written SR value in spi_nor_write_16bit_sr_and_check()
ab88c8d906 libbpf: Fix logic for finding matching program for CO-RE relocation
97b56f17b3 selftests/resctrl: Fix null pointer dereference on open failed
c54d66c514 scsi: ufs: core: Exclude UECxx from SFR dump list
02192ee936 scsi: ufs: qcom: Fix ufs_qcom_resume()
328cfeac73 drm/msm/dpu: adjust display_v_end for eDP and DP
cc68e53f9a of: overlay: do not break notify on NOTIFY_{OK|STOP}
f929416d5c fsnotify: fix wrong lockdep annotations
94845fc422 inotify: show inotify mask flags in proc fdinfo
f2c68c5289 ALSA: pcm: Check for null pointer of pointer substream before dereferencing it
d764a7d647 drm/panel: simple: Add missing bus flags for Innolux G070Y2-L01
b6b70cd3dd media: hantro: Empty encoder capture buffers by default
461e4c1f19 ath9k_htc: fix potential out of bounds access with invalid rxstatus->rs_keyix
96c848afbd cpufreq: Fix possible race in cpufreq online error path
172789fd95 spi: img-spfi: Fix pm_runtime_get_sync() error checking
147a376c1a sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
f35c3f2374 drm/bridge: Fix error handling in analogix_dp_probe
6d0726725c HID: elan: Fix potential double free in elan_input_configured
39d4bd3f59 HID: hid-led: fix maximum brightness for Dream Cheeky
3c68daf4a3 mtd: rawnand: denali: Use managed device resources
dd2b1d70ef EDAC/dmc520: Don't print an error for each unconfigured interrupt line
bea6985099 drbd: fix duplicate array initializer
3eba802d47 target: remove an incorrect unmap zeroes data deduction
e7681199bb efi: Add missing prototype for efi_capsule_setup_info
2a1b5110c9 NFC: NULL out the dev->rfkill to prevent UAF
8e357f086d net: dsa: mt7530: 1G can also support 1000BASE-X link mode
4565d5be8b scftorture: Fix distribution of short handler delays
58eff5b73f spi: spi-ti-qspi: Fix return value handling of wait_for_completion_timeout
b4c7dd0037 drm: mali-dp: potential dereference of null pointer
78a3e9fcdb drm/komeda: Fix an undefined behavior bug in komeda_plane_add()
3cea0259ed nl80211: show SSID for P2P_GO interfaces
6c0a8c771a bpf: Fix excessive memory allocation in stack_map_alloc()
7ff76dc2d8 libbpf: Don't error out on CO-RE relos for overriden weak subprogs
84b0e23e10 drm/vc4: txp: Force alpha to be 0xff if it's disabled
ac904216b8 drm/vc4: txp: Don't set TXP_VSTART_AT_EOF
15cec7dfd3 drm/vc4: hvs: Reset muxes at probe time
2268f190af drm/mediatek: Fix mtk_cec_mask()
032f8c67fe drm/ingenic: Reset pixclock rate when parent clock rate changes
58c7c01577 x86/delay: Fix the wrong asm constraint in delay_loop()
f279c49f17 ASoC: mediatek: Fix missing of_node_put in mt2701_wm8960_machine_probe
fb66e0512e ASoC: mediatek: Fix error handling in mt8173_max98090_dev_probe
35db6e2e99 spi: qcom-qspi: Add minItems to interconnect-names
187ecfc3b7 drm/bridge: adv7511: clean up CEC adapter when probe fails
9072d62785 drm/edid: fix invalid EDID extension block filtering
0d6dc3efb1 ath9k: fix ar9003_get_eepmisc
822dac24b4 ath11k: acquire ab->base_lock in unassign when finding the peer by addr
3ed327b77d dt-bindings: display: sitronix, st7735r: Fix backlight in example
61bbbde9b6 drm: fix EDID struct for old ARM OABI format
cc80d3c37c RDMA/hfi1: Prevent panic when SDMA is disabled
dfc308d6f2 powerpc/iommu: Add missing of_node_put in iommu_init_early_dart
b4e14e9beb macintosh/via-pmu: Fix build failure when CONFIG_INPUT is disabled
0230055fa6 powerpc/powernv: fix missing of_node_put in uv_init()
6a61a97106 powerpc/xics: fix refcount leak in icp_opal_init()
8a665c2791 powerpc/powernv/vas: Assign real address to rx_fifo in vas_rx_win_attr
5a3767ac79 tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
eff3587b9c PCI: Avoid pci_dev_lock() AB/BA deadlock with sriov_numvfs_store()
21a3effe44 ARM: hisi: Add missing of_node_put after of_find_compatible_node
d2b3b380c1 ARM: dts: exynos: add atmel,24c128 fallback to Samsung EEPROM
d146e2a986 ARM: versatile: Add missing of_node_put in dcscb_init
b646e0cfeb pinctrl: renesas: rzn1: Fix possible null-ptr-deref in sh_pfc_map_resources()
c16f1b3d72 fat: add ratelimit to fat*_ent_bread()
f20c7cd2b2 powerpc/fadump: Fix fadump to work with a different endian capture kernel
039966775c ARM: OMAP1: clock: Fix UART rate reporting algorithm
9dfa8d087b fs: jfs: fix possible NULL pointer dereference in dbFree()
05efc4591f soc: ti: ti_sci_pm_domains: Check for null return of devm_kcalloc
0f9091f202 crypto: ccree - use fine grained DMA mapping dir
86b091b689 PM / devfreq: rk3399_dmc: Disable edev on remove()
7e391ec939 arm64: dts: qcom: msm8994: Fix BLSP[12]_DMA channels count
c400439adc ARM: dts: s5pv210: align DMA channels with dtschema
0521c52978 ARM: dts: ox820: align interrupt controller node name with dtschema
968a668376 IB/rdmavt: add missing locks in rvt_ruc_loopback
6a2e275834 gfs2: use i_lock spin_lock for inode qadata
92ef7a8719 selftests/bpf: fix btf_dump/btf_dump due to recent clang change
340cf91293 eth: tg3: silence the GCC 12 array-bounds warning
cb2ca93f8f rxrpc, afs: Fix selection of abort codes
4a4e2e90ec rxrpc: Return an error to sendmsg if call failed
6c18a0fcd6 m68k: atari: Make Atari ROM port I/O write macros return void
76744a016e x86/microcode: Add explicit CPU vendor dependency
f29fb46232 can: mcp251xfd: silence clang's -Wunaligned-access warning
ff383c1879 ASoC: rt1015p: remove dependency on GPIOLIB
c73aee1946 ASoC: max98357a: remove dependency on GPIOLIB
86c02171bd media: exynos4-is: Fix compile warning
abb5594ae2 net: phy: micrel: Allow probing without .driver_data
8d33585ffa nbd: Fix hung on disconnect request if socket is closed before
1a5a3dfd9f ASoC: rt5645: Fix errorenous cleanup order
af98940dd3 nvme-pci: fix a NULL pointer dereference in nvme_alloc_admin_tags
8671aeeef2 openrisc: start CPU timer early in boot
22cdbb1354 media: cec-adap.c: fix is_configuring state
4cf6ba9367 media: imon: reorganize serialization
f3915b4665 media: coda: limit frame interval enumeration to supported encoder frame sizes
8ddc89437c media: rga: fix possible memory leak in rga_probe
f9413b9023 rtlwifi: Use pr_warn instead of WARN_ONCE
eb7a71b7b2 ipmi: Fix pr_fmt to avoid compilation issues
fa390c8b62 ipmi:ssif: Check for NULL msg when handling events and messages
0b7c1dc7ee ACPI: PM: Block ASUS B1400CEAE from suspend to idle by default
1ecd01d77c dma-debug: change allocation mode from GFP_NOWAIT to GFP_ATIOMIC
a61583744e spi: stm32-qspi: Fix wait_cmd timeout in APM mode
0c05c03c51 perf/amd/ibs: Cascade pmu init functions' return value
4605458398 s390/preempt: disable __preempt_count_add() optimization for PROFILE_ALL_BRANCHES
312c43e98e net: remove two BUG() from skb_checksum_help()
4f99bde59e ASoC: tscs454: Add endianness flag in snd_soc_component_driver
296f8ca0f7 HID: bigben: fix slab-out-of-bounds Write in bigben_probe
3ee67465f7 drm/amdgpu/ucode: Remove firmware load type check in amdgpu_ucode_free_bo
6f19abe031 mlxsw: Treat LLDP packets as control
b30e727f09 mlxsw: spectrum_dcb: Do not warn about priority changes
d68a5eb7b3 ASoC: dapm: Don't fold register value changes into notifications
9b42659cb3 net/mlx5: fs, delete the FTE when there are no rules attached to it
4d85201adb ipv6: Don't send rs packets to the interface of ARPHRD_TUNNEL
0325c08ae2 drm: msm: fix error check return value of irq_of_parse_and_map()
ad97425d23 arm64: compat: Do not treat syscall number as ESR_ELx for a bad syscall
8aa3750986 ath10k: skip ath10k_halt during suspend for driver state RESTARTING
20ad91d08a drm/amd/pm: fix the compile warning
b5cd108143 drm/plane: Move range check for format_count earlier
8c3fe9ff80 ASoC: Intel: bytcr_rt5640: Add quirk for the HP Pro Tablet 408
60afa4f4e1 ath11k: disable spectral scan during spectral deinit
fa1b509d41 scsi: lpfc: Fix resource leak in lpfc_sli4_send_seq_to_ulp()
1869f9bfaf scsi: ufs: Use pm_runtime_resume_and_get() instead of pm_runtime_get_sync()
508add11af scsi: megaraid: Fix error check return value of register_chrdev()
95050b9847 drivers: mmc: sdhci_am654: Add the quirk to set TESTCD bit
90281cadf5 mmc: jz4740: Apply DMA engine limits to maximum segment size
e69e93120f md/bitmap: don't set sb values if can't pass sanity check
3f94169aff media: cx25821: Fix the warning when removing the module
ca17e7a532 media: pci: cx23885: Fix the error handling in cx23885_initdev()
27ad46da44 media: venus: hfi: avoid null dereference in deinit
e68270a786 ath9k: fix QCA9561 PA bias level
ca1ce20689 drm/amd/pm: fix double free in si_parse_power_table()
3102e9d7e5 tools/power turbostat: fix ICX DRAM power numbers
fbfeb9bc94 spi: spi-rspi: Remove setting {src,dst}_{addr,addr_width} based on DMA direction
e2b8681769 ALSA: jack: Access input_dev under mutex
005990e30d sfc: ef10: Fix assigning negative value to unsigned variable
10f30cba8f rcu: Make TASKS_RUDE_RCU select IRQ_WORK
1c6c3f2336 rcu-tasks: Fix race in schedule and flush work
c977d63b8c drm/komeda: return early if drm_universal_plane_init() fails.
cd97a481ea ACPICA: Avoid cache flush inside virtual machines
29cb802966 x86/platform/uv: Update TSC sync state for UV5
59dd1a07ee fbcon: Consistently protect deferred_takeover with console_lock()
5bfb65e92f ipv6: fix locking issues with loops over idev->addr_list
98d1dc32f8 ipw2x00: Fix potential NULL dereference in libipw_xmit()
cc575b8558 b43: Fix assigning negative value to unsigned variable
4ae5a2ccf5 b43legacy: Fix assigning negative value to unsigned variable
74ad0d7450 mwifiex: add mutex lock for call in mwifiex_dfs_chan_sw_work_queue
fadc626cae drm/virtio: fix NULL pointer dereference in virtio_gpu_conn_get_modes
c6380d9d2d iommu/vt-d: Add RPLS to quirk list to skip TE disabling
509e9710b8 btrfs: repair super block num_devices automatically
4093eea47d btrfs: add "0x" prefix for unsupported optional features
b49516583f ptrace: Reimplement PTRACE_KILL by always sending SIGKILL
f8ef79687b ptrace/xtensa: Replace PT_SINGLESTEP with TIF_SINGLESTEP
6580673b17 ptrace/um: Replace PT_DTRACE with TIF_SINGLESTEP
92fb46536a perf/x86/intel: Fix event constraints for ICL
b4acb8e7f1 x86/MCE/AMD: Fix memory leak when threshold_create_bank() fails
860e44f21f parisc/stifb: Keep track of hardware path of graphics card
78e008dca2 Fonts: Make font size unsigned in font_desc
c5b9b7fb12 xhci: Allow host runtime PM as default for Intel Alder Lake N xHCI
c9ac773715 cifs: when extending a file with falloc we should make files not-sparse
ce4627f09e usb: core: hcd: Add support for deferring roothub registration
a2532c4417 usb: dwc3: gadget: Move null pinter check to proper place
0420275d64 USB: new quirk for Dell Gen 2 devices
19b3fe8a7c USB: serial: option: add Quectel BG95 modem
40bdb5ec95 ALSA: usb-audio: Cancel pending work at closing a MIDI substream
1cf70d5c15 ALSA: hda/realtek - Fix microphone noise on ASUS TUF B550M-PLUS
223368eaf6 ALSA: hda/realtek: Enable 4-speaker output for Dell XPS 15 9520 laptop
d2f3acde3d riscv: Fix irq_work when SMP is disabled
4a5c7a61ff riscv: Initialize thread pointer before calling C functions
6b45437959 parisc/stifb: Implement fb_is_primary_device()
9cef71ecea binfmt_flat: do not stop relocating GOT entries prematurely on riscv
43ca8e1dfb Merge 5.10.118 into android12-5.10-lts
70dd2d169d Linux 5.10.120
886eeb0460 bpf: Enlarge offset check value to INT_MAX in bpf_skb_{load,store}_bytes
7f845de286 bpf: Fix potential array overflow in bpf_trampoline_get_progs()
3097f38e91 NFSD: Fix possible sleep during nfsd4_release_lockowner()
78a62e09d8 NFS: Memory allocation failures are not server fatal errors
1d100fcc1d docs: submitting-patches: Fix crossref to 'The canonical patch format'
ebbbffae71 tpm: ibmvtpm: Correct the return value in tpm_ibmvtpm_probe()
5933a191ac tpm: Fix buffer access in tpm2_get_tpm_pt()
0c56e5d0e6 HID: multitouch: add quirks to enable Lenovo X12 trackpoint
d6822d82c0 HID: multitouch: Add support for Google Whiskers Touchpad
0f03885059 raid5: introduce MD_BROKEN
8df42bcd36 dm verity: set DM_TARGET_IMMUTABLE feature flag
e39b536d70 dm stats: add cond_resched when looping over entries
4617778417 dm crypt: make printing of the key constant-time
bb64957c47 dm integrity: fix error code in dm_integrity_ctr()
8845027e55 ARM: dts: s5pv210: Correct interrupt name for bluetooth in Aries
4989bb0334 Bluetooth: hci_qca: Use del_timer_sync() before freeing
fae05b2314 zsmalloc: fix races between asynchronous zspage free and page migration
6a1cc25494 crypto: ecrdsa - Fix incorrect use of vli_cmp
c013f7d1cd crypto: caam - fix i.MX6SX entropy delay value
3d8fc6e28f KVM: x86: avoid calling x86 emulator without a decoded instruction
a2a3fa5b61 x86, kvm: use correct GFP flags for preemption disabled
4a9f3a9c28 x86/kvm: Alloc dummy async #PF token outside of raw spinlock
4c4a11c74a KVM: PPC: Book3S HV: fix incorrect NULL check on list iterator
91a36ec160 netfilter: conntrack: re-fetch conntrack after insertion
c0aff1faf6 netfilter: nf_tables: sanitize nft_set_desc_concat_parse()
44f1ce5530 crypto: drbg - make reseeding from get_random_bytes() synchronous
e744e34a3c crypto: drbg - move dynamic ->reseed_threshold adjustments to __drbg_seed()
54700e82a7 crypto: drbg - track whether DRBG was seeded with !rng_is_initialized()
b2bef5500e crypto: drbg - prepare for more fine-grained tracking of seeding state
630192aa45 lib/crypto: add prompts back to crypto libraries
82f723b8a5 exfat: check if cluster num is valid
1f0681f3bd drm/i915: Fix -Wstringop-overflow warning in call to intel_read_wm_latency()
2728d95c6c xfs: Fix CIL throttle hang when CIL space used going backwards
a9e7f19a55 xfs: fix an ABBA deadlock in xfs_rename
72464fd2b4 xfs: fix the forward progress assertion in xfs_iwalk_run_callbacks
45d97f70da xfs: show the proper user quota options
f20e67b455 xfs: detect overflows in bmbt records
ffc8d61387 net: ipa: compute proper aggregation limit
8adb751d29 io_uring: fix using under-expanded iters
57d01bcae7 io_uring: don't re-import iovecs from callbacks
6029f86740 assoc_array: Fix BUG_ON during garbage collect
b96b4aa65b cfg80211: set custom regdomain after wiphy registration
8fbd54ab06 pipe: Fix missing lock in pipe_resize_ring()
cd720fad8b pipe: make poll_usage boolean and annotate its access
ea62d169b6 netfilter: nf_tables: disallow non-stateful expression in sets earlier
5525af175b drivers: i2c: thunderx: Allow driver to work with ACPI defined TWSI controllers
f0749aecb2 i2c: ismt: Provide a DMA buffer for Interrupt Cause Logging
828309eee5 net: ftgmac100: Disable hardware checksum on AST2600
640397afdf nfc: pn533: Fix buggy cleanup order
ac8d5eb26c net: af_key: check encryption module availability consistency
d007f49ab7 percpu_ref_init(): clean ->percpu_count_ref on failure
75e35951d6 pinctrl: sunxi: fix f1c100s uart2 function
56c31ac1d8 Linux 5.10.119
7c57f21349 ALSA: ctxfi: Add SB046x PCI ID
514f587340 random: check for signals after page of pool writes
18c261e948 random: wire up fops->splice_{read,write}_iter()
cf8f8d3758 random: convert to using fops->write_iter()
affa1ae522 random: convert to using fops->read_iter()
4bb374a118 random: unify batched entropy implementations
552ae8e484 random: move randomize_page() into mm where it belongs
5f2a040b2f random: move initialization functions out of hot pages
02102b63bd random: make consistent use of buf and len
33783ca355 random: use proper return types on get_random_{int,long}_wait()
1fdd7eef21 random: remove extern from functions in header
811afd06e0 random: use static branch for crng_ready()
04d61b96bd random: credit architectural init the exact amount
5123cc61e2 random: handle latent entropy and command line from random_init()
9320e087f2 random: use proper jiffies comparison macro
31ac294037 random: remove ratelimiting for in-kernel unseeded randomness
b50f2830b3 random: move initialization out of reseeding hot path
4c4110c052 random: avoid initializing twice in credit race
cef9010b78 random: use symbolic constants for crng_init states
30e9f36266 siphash: use one source of truth for siphash permutations
772edeb8c7 random: help compiler out with fast_mix() by using simpler arguments
1841347233 random: do not use input pool from hard IRQs
999b0c9e8a random: order timer entropy functions below interrupt functions
ce3c4ff381 random: do not pretend to handle premature next security model
24d3275685 random: use first 128 bits of input as fast init
273aebb50b random: do not use batches when !crng_ready()
f4c98fe1d1 random: insist on random_get_entropy() existing in order to simplify
ffcfdd5de9 xtensa: use fallback for random_get_entropy() instead of zero
e1ea0e26d3 sparc: use fallback for random_get_entropy() instead of zero
a5092be129 um: use fallback for random_get_entropy() instead of zero
25d4fdf1f0 x86/tsc: Use fallback for random_get_entropy() instead of zero
0b93f40cbe nios2: use fallback for random_get_entropy() instead of zero
fdca775081 arm: use fallback for random_get_entropy() instead of zero
d5531246af mips: use fallback for random_get_entropy() instead of just c0 random
714def4497 riscv: use fallback for random_get_entropy() instead of zero
84397906a6 m68k: use fallback for random_get_entropy() instead of zero
7690be1adf timekeeping: Add raw clock fallback for random_get_entropy()
07b5d0b3e2 powerpc: define get_cycles macro for arch-override
30ee01bcdc alpha: define get_cycles macro for arch-override
c55a863c30 parisc: define get_cycles macro for arch-override
641d1fbd96 s390: define get_cycles macro for arch-override
c895438b17 ia64: define get_cycles macro for arch-override
7d9eab78be init: call time_init() before rand_initialize()
ec25e386d3 random: fix sysctl documentation nits
9dff512945 random: document crng_fast_key_erasure() destination possibility
a1b5c849d8 random: make random_get_entropy() return an unsigned long
72a9ec8d75 random: allow partial reads if later user copies fail
1805d20dfb random: check for signals every PAGE_SIZE chunk of /dev/[u]random
9641d9b430 random: check for signal_pending() outside of need_resched() check
26ee8fa4df random: do not allow user to keep crng key around on stack
bb515a5bef random: do not split fast init input in add_hwgenerator_randomness()
be0d4e3e96 random: mix build-time latent entropy into pool at init
bb563d06c5 random: re-add removed comment about get_random_{u32,u64} reseeding
f3bc5eca83 random: treat bootloader trust toggle the same way as cpu trust toggle
7cb6782146 random: skip fast_init if hwrng provides large chunk of entropy
083ab33951 random: check for signal and try earlier when generating entropy
20da9c6079 random: reseed more often immediately after booting
9891211dfe random: make consistent usage of crng_ready()
95a1c94a1b random: use SipHash as interrupt entropy accumulator
849e7b744c random: replace custom notifier chain with standard one
66307429b5 random: don't let 644 read-only sysctls be written to
4c74ca006a random: give sysctl_random_min_urandom_seed a more sensible value
0964a76fd5 random: do crng pre-init loading in worker rather than irq
192d4c6cb3 random: unify cycles_t and jiffies usage and types
47f0e89b71 random: cleanup UUID handling
9b0e0e2714 random: only wake up writers after zap if threshold was passed
c47f215ab3 random: round-robin registers as ulong, not u32
5064550d42 random: clear fast pool, crng, and batches in cpuhp bring up
6e1cb84cc6 random: pull add_hwgenerator_randomness() declaration into random.h
32252548b5 random: check for crng_init == 0 in add_device_randomness()
684e9fe92d random: unify early init crng load accounting
f656bd0011 random: do not take pool spinlock at boot
5d73e69a5d random: defer fast pool mixing to worker
7873321cd8 random: rewrite header introductory comment
6d1671b6d2 random: group sysctl functions
21ae543e3a random: group userspace read/write functions
f04580811d random: group entropy collection functions
e9ff357860 random: group entropy extraction functions
d7e5b1925a random: group crng functions
6b1ffb3b5a random: group initialization wait functions
6c9cee1555 random: remove whitespace and reorder includes
7b0f36f7c2 random: remove useless header comment
b390181654 random: introduce drain_entropy() helper to declutter crng_reseed()
0971c1c2fd random: deobfuscate irq u32/u64 contributions
ae1b8f1954 random: add proper SPDX header
9342656c01 random: remove unused tracepoints
17ad693cd2 random: remove ifdef'd out interrupt bench
28683a1885 random: tie batched entropy generation to base_crng generation
adc32acf23 random: fix locking for crng_init in crng_reseed()
bb63851c25 random: zero buffer after reading entropy from userspace
63c1aae40a random: remove outdated INT_MAX >> 6 check in urandom_read()
07280d2c3f random: make more consistent use of integer types
655a69cb41 random: use hash function for crng_slow_load()
95026060d8 random: use simpler fast key erasure flow on per-cpu keys
732872aa2c random: absorb fast pool into input pool after fast load
7a5b9ca583 random: do not xor RDRAND when writing into /dev/random
16a6e4ae71 random: ensure early RDSEED goes through mixer on init
c521bf08ee random: inline leaves of rand_initialize()
70377ee074 random: get rid of secondary crngs
c36e71b5a5 random: use RDSEED instead of RDRAND in entropy extraction
1d1582e5fe random: fix locking in crng_fast_load()
0762b7d1f1 random: remove batched entropy locking
8d07e2a226 random: remove use_input_pool parameter from crng_reseed()
b07fcd9e53 random: make credit_entropy_bits() always safe
32d1d7ce3a random: always wake up entropy writers after extraction
9852922061 random: use linear min-entropy accumulation crediting
bb9c45cfb9 random: simplify entropy debiting
de0727c0c4 random: use computational hash for entropy extraction
e0cc561e47 random: only call crng_finalize_init() for primary_crng
480fd91dcd random: access primary_pool directly rather than through pointer
0b9e36e895 random: continually use hwgenerator randomness
6d2d29f051 random: simplify arithmetic function flow in account()
a0653a9ec1 random: selectively clang-format where it makes sense
bccc8d9231 random: access input_pool_data directly rather than through pointer
a9db850c21 random: cleanup fractional entropy shift constants
edd294052e random: prepend remaining pool constants with POOL_
f87f50b843 random: de-duplicate INPUT_POOL constants
09ae6b8519 random: remove unused OUTPUT_POOL constants
8cc5260c19 random: rather than entropy_store abstraction, use global
5897e06ac1 random: remove unused extract_entropy() reserved argument
ae093ca125 random: remove incomplete last_data logic
7abbc9809f random: cleanup integer types
c9e108e36d random: cleanup poolinfo abstraction
8a3b78f917 random: fix typo in comments
0ad5d6384d random: don't reset crng_init_cnt on urandom_read()
17420c77f0 random: avoid superfluous call to RDRAND in CRNG extraction
c245231aec random: early initialization of ChaCha constants
efaddd56bc random: use IS_ENABLED(CONFIG_NUMA) instead of ifdefs
6443204102 random: harmonize "crng init done" messages
ca57d51126 random: mix bootloader randomness into pool
542d8ebedb random: do not re-init if crng_reseed completes before primary init
2bfdf588a8 random: do not sign extend bytes for rotation when mixing
685200b076 random: use BLAKE2s instead of SHA1 in extraction
33c30bfe4f random: remove unused irq_flags argument from add_interrupt_randomness()
b57a888740 random: document add_hwgenerator_randomness() with other input functions
ae33c501e0 lib/crypto: blake2s: avoid indirect calls to compression function for Clang CFI
07918ddba3 lib/crypto: sha1: re-roll loops to reduce code size
5fb6a3ba3a lib/crypto: blake2s: move hmac construction into wireguard
62531d446a lib/crypto: blake2s: include as built-in
aec0878b1d crypto: blake2s - include <linux/bug.h> instead of <asm/bug.h>
030d3443aa crypto: blake2s - adjust include guard naming
fea91e9070 crypto: blake2s - add comment for blake2s_state fields
d45ae768b7 crypto: blake2s - optimize blake2s initialization
6c362b7c77 crypto: blake2s - share the "shash" API boilerplate code
72e5b68f33 crypto: blake2s - move update and final logic to internal/blake2s.h
e467a55bd0 crypto: blake2s - remove unneeded includes
198a19d7ee crypto: x86/blake2s - define shash_alg structs using macros
89f9ee998e crypto: blake2s - define shash_alg structs using macros
0f8fcf5b6e crypto: lib/blake2s - Move selftest prototype into header file
c3a4645d80 MAINTAINERS: add git tree for random.c
c4882c6e1e MAINTAINERS: co-maintain random.c
acb198c4d1 random: remove dead code left over from blocking pool
6227458fef random: avoid arch_get_random_seed_long() when collecting IRQ randomness
257fbea15a ACPI: sysfs: Fix BERT error region memory mapping
14fa2769ea ACPI: sysfs: Make sparse happy about address space in use
0debc69f00 media: vim2m: initialize the media device earlier
ed0e71cc3f media: vim2m: Register video device after setting up internals
a5c68f457f secure_seq: use the 64 bits of the siphash for port offset calculation
33f1b4a27a tcp: change source port randomizarion at connect() time
9b4aa0d80b KVM: x86/mmu: fix NULL pointer dereference on guest INVPCID
74c6e5d584 KVM: x86: Properly handle APF vs disabled LAPIC situation
c06e5f751a staging: rtl8723bs: prevent ->Ssid overflow in rtw_wx_set_scan()
a8f4d63142 lockdown: also lock down previous kgdb use
c204ee3350 Linux 5.10.118
56642f6af2 module: check for exit sections in layout_sections() instead of module_init_section()
633be494c3 include/uapi/linux/xfrm.h: Fix XFRM_MSG_MAPPING ABI breakage
61a4cc41e5 afs: Fix afs_getattr() to refetch file status if callback break occurred
606011cb6a i2c: mt7621: fix missing clk_disable_unprepare() on error in mtk_i2c_probe()
030de84d45 module: treat exit sections the same as init sections when !CONFIG_MODULE_UNLOAD
355141fdbf dt-bindings: pinctrl: aspeed-g6: remove FWQSPID group
d30fdf7d13 Input: ili210x - fix reset timing
a698bf1f72 arm64: Enable repeat tlbi workaround on KRYO4XX gold CPUs
696292b9b5 net: atlantic: verify hw_head_ lies within TX buffer ring
cd66ab20a8 net: atlantic: add check for MAX_SKB_FRAGS
9bee8b4275 net: atlantic: reduce scope of is_rsc_complete
9b84e83a92 net: atlantic: fix "frag[0] not initialized"
0ae23a1d47 net: stmmac: fix missing pci_disable_device() on error in stmmac_pci_probe()
d4c6e5cebc ethernet: tulip: fix missing pci_disable_device() on error in tulip_init_one()
3a6dee284f nl80211: fix locking in nl80211_set_tx_bitrate_mask()
efe580c436 selftests: add ping test with ping_group_range tuned
1cfbf6d3a7 nl80211: validate S1G channel width
a0f5ff2049 mac80211: fix rx reordering with non explicit / psmp ack policy
e21d734fd0 scsi: qla2xxx: Fix missed DMA unmap for aborted commands
c5af341747 perf bench numa: Address compiler error on s390
210ea7da5c gpio: mvebu/pwm: Refuse requests with inverted polarity
30d4721fec gpio: gpio-vf610: do not touch other bits when set the target bit
ea8a9cb4a7 riscv: dts: sifive: fu540-c000: align dma node name with dtschema
dfd1f0cb62 net: bridge: Clear offload_fwd_mark when passing frame up bridge interface.
579061f391 igb: skip phy status check where unavailable
a89888648e ARM: 9197/1: spectre-bhb: fix loop8 sequence for Thumb2
1756b45d8d ARM: 9196/1: spectre-bhb: enable for Cortex-A15
7b676abe32 net: af_key: add check for pfkey_broadcast in function pfkey_process
697f3219ee net/mlx5e: Properly block LRO when XDP is enabled
b503d0228c NFC: nci: fix sleep in atomic context bugs caused by nci_skb_alloc
42d4287cc1 net/qla3xxx: Fix a test in ql_reset_work()
d35bf8d766 clk: at91: generated: consider range when calculating best rate
9e0e75a5e7 ice: fix possible under reporting of ethtool Tx and Rx statistics
6e2caee5cd net: vmxnet3: fix possible NULL pointer dereference in vmxnet3_rq_cleanup()
a54d86cf41 net: vmxnet3: fix possible use-after-free bugs in vmxnet3_rq_alloc_rx_buf()
201e5b5c27 net: systemport: Fix an error handling path in bcm_sysport_probe()
9bfe898e2b net/sched: act_pedit: sanitize shift argument before usage
47f04f95ed xfrm: fix "disable_policy" flag use when arriving from different devices
0d2e9d8000 xfrm: rework default policy structure
57c1bbe709 xfrm: fix dflt policy check when there is no policy configured
9856c3a129 xfrm: notify default policy on update
20fd28df40 xfrm: make user policy API complete
ab610ee1d1 net: xfrm: fix shift-out-of-bounce
5b7f84b1f9 xfrm: Add possibility to set the default to block if we have no policy
243e72e204 net: evaluate net.ipvX.conf.all.disable_policy and disable_xfrm
1bc27eb71b net: macb: Increment rx bd head after allocating skb and buffer
998e305bd1 net: ipa: record proper RX transaction count
0599d5a8b4 ARM: dts: aspeed-g6: fix SPI1/SPI2 quad pin group
0a2847d448 pinctrl: pinctrl-aspeed-g6: remove FWQSPID group in pinctrl
d8ca684c3d ARM: dts: aspeed-g6: remove FWQSPID group in pinctrl dtsi
3fc2846099 dma-buf: fix use of DMA_BUF_SET_NAME_{A,B} in userspace
e5289affba drm/dp/mst: fix a possible memory leak in fetch_monitor_name()
8ceca1a069 libceph: fix potential use-after-free on linger ping and resends
233a3cc60e crypto: qcom-rng - fix infinite loop on requests not multiple of WORD_SZ
6013ef5f51 arm64: mte: Ensure the cleared tags are visible before setting the PTE
a817f78ed6 arm64: paravirt: Use RCU read locks to guard stolen_time
b49bc8d615 KVM: x86/mmu: Update number of zapped pages even if page list is stable
146128ba26 PCI/PM: Avoid putting Elo i2 PCIe Ports in D3cold
ec0d801d1a Fix double fget() in vhost_net_set_backend()
b42e5e3a84 selinux: fix bad cleanup on error in hashtab_duplicate()
3ee8e109c3 perf: Fix sys_perf_event_open() race against self
18fb7d533c ALSA: hda/realtek: Add quirk for TongFang devices with pop noise
3eaf770163 ALSA: wavefront: Proper check of get_user() error
a34d018b6e ALSA: usb-audio: Restore Rane SL-1 quirk
f3f2247ac3 Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE""
e2cfa7b093 Revert "swiotlb: fix info leak with DMA_FROM_DEVICE"
fe5ac3da50 nilfs2: fix lockdep warnings during disk space reclamation
d626fcdabe nilfs2: fix lockdep warnings in page operations for btree nodes
aca18bacdb ARM: 9191/1: arm/stacktrace, kasan: Silence KASAN warnings in unwind_frame()
0acaf9cacd platform/chrome: cros_ec_debugfs: detach log reader wq from devm
5a19f3c2d3 drbd: remove usage of list iterator variable after loop
9b7f321106 MIPS: lantiq: check the return value of kzalloc()
05c073b1ad fs: fix an infinite loop in iomap_fiemap
00d8b06a4e rtc: mc146818-lib: Fix the AltCentury for AMD platforms
87fd0dd43e nvme-multipath: fix hang when disk goes live over reconnect
3663d6023a tools/virtio: compile with -pthread
5a4cbcb3df vhost_vdpa: don't setup irq offloading when irq_num < 0
f0931ee125 s390/pci: improve zpci_dev reference counting
7d3f69cbde ALSA: hda/realtek: Enable headset mic on Lenovo P360
a59450656b crypto: x86/chacha20 - Avoid spurious jumps to other functions
39acee8aea crypto: stm32 - fix reference leak in stm32_crc_remove
703c80ff43 rtc: sun6i: Fix time overflow handling
bab037ebbe gfs2: Disable page faults during lockless buffered reads
e803f12ea2 nvme-pci: add quirks for Samsung X5 SSDs
5565fc538d Input: stmfts - fix reference leak in stmfts_input_open
d5e88c2d76 Input: add bounds checking to input_set_capability()
ea6a86886c um: Cleanup syscall_handler_t definition/cast, fix warning
c39b91fcd5 rtc: pcf2127: fix bug when reading alarm registers
2b4e5a2d7d rtc: fix use-after-free on device removal
67136fff5b igc: Update I226_K device ID
d0229838b6 igc: Remove phy->type checking
170110adbe igc: Remove _I_PHY_ID checking
55c820c1b2 Revert "drm/i915/opregion: check port number bounds for SWSCI display power state"
911b362678 floppy: use a statically allocated error counter
3c48558be5 io_uring: always grab file table for deferred statx
a1a2c957da usb: gadget: fix race when gadget driver register via ioctl

ABI updated to add a new symbol that is needed to be tracked:

Leaf changes summary: 1 artifact changed
Changed leaf types summary: 0 leaf type changed
Removed/Changed/Added functions summary: 0 Removed, 0 Changed, 1 Added function
Removed/Changed/Added variables summary: 0 Removed, 0 Changed, 0 Added variable

1 Added function:

  [A] 'function bool rng_is_initialized()'

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ib7f64defc72960f3603eb23b9a401a9fd42ec217
2022-09-28 09:54:28 +02:00

9179 lines
254 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
* Manages the free list, the system allocates free pages here.
* Note that kmalloc() lives in slab.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
*/
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <trace/hooks/mm.h>
#include <trace/hooks/vmscan.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
/* No special request */
#define FPI_NONE ((__force fpi_t)0)
/*
* Skip free page reporting notification for the (possibly merged) page.
* This does not hinder free page reporting from grabbing the page,
* reporting it and marking it "reported" - it only skips notifying
* the free page reporting infrastructure about a newly freed page. For
* example, used when temporarily pulling a page from a freelist and
* putting it back unmodified.
*/
#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
/*
* Place the (possibly merged) page to the tail of the freelist. Will ignore
* page shuffling (relevant code - e.g., memory onlining - is expected to
* shuffle the whole zone).
*
* Note: No code should rely on this flag for correctness - it's purely
* to allow for optimizations when handing back either fresh pages
* (memory onlining) or untouched pages (page isolation, free page
* reporting).
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
/*
* Don't poison memory with KASAN (only for the tag-based modes).
* During boot, all non-reserved memblock memory is exposed to page_alloc.
* Poisoning all that memory lengthens boot time, especially on systems with
* large amount of RAM. This flag is used to skip that poisoning.
* This is only done for the tag-based KASAN modes, as those are able to
* detect memory corruptions with the memory tags assigned by default.
* All memory allocated normally after boot gets poisoned as usual.
*/
#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
* defined in <linux/topology.h>.
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
/* work_structs for global per-cpu drains */
struct pcpu_drain {
struct zone *zone;
struct work_struct work;
};
static DEFINE_MUTEX(pcpu_drain_mutex);
static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif
/*
* Array of node states.
*/
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);
atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_FALSE(init_on_free);
EXPORT_SYMBOL(init_on_free);
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);
static bool _init_on_free_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
* Also the migratetype set in the page does not necessarily match the pcplist
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
* other index - this ensures that it will be put on the correct CMA freelist.
*/
static inline int get_pcppage_migratetype(struct page *page)
{
return page->index;
}
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
page->index = migratetype;
}
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
* they should always be called with system_transition_mutex held
* (gfp_allowed_mask also should only be modified with system_transition_mutex
* held, unless the suspend/hibernate code is guaranteed not to run in parallel
* with that modification).
*/
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
}
}
void pm_restrict_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}
bool pm_suspended_storage(void)
{
if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
return false;
return true;
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256,
#endif
[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0,
#endif
[ZONE_MOVABLE] = 0,
};
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
"DMA32",
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
"HighMem",
#endif
"Movable",
#ifdef CONFIG_ZONE_DEVICE
"Device",
#endif
};
const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
#ifdef CONFIG_CMA
"CMA",
#endif
"HighAtomic",
#ifdef CONFIG_MEMORY_ISOLATION
"Isolate",
#endif
};
compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
[NULL_COMPOUND_DTOR] = NULL,
[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
/*
* Try to keep at least this much lowmem free. Do not allow normal
* allocations below this point, only high priority ones. Automatically
* tuned according to the amount of memory in the system.
*/
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
#ifdef CONFIG_DISCONTIGMEM
/*
* DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
* are not on separate NUMA nodes. Functionally this works but with
* watermark_boost_factor, it can reclaim prematurely as the ranges can be
* quite small. By default, do not boost watermarks on discontigmem as in
* many cases very high-order allocations like THP are likely to be
* unsupported and the premature reclaim offsets the advantage of long-term
* fragmentation avoidance.
*/
int watermark_boost_factor __read_mostly;
#else
int watermark_boost_factor __read_mostly = 15000;
#endif
int watermark_scale_factor = 10;
/*
* Extra memory for the system to try freeing. Used to temporarily
* free memory, to make space for new workloads. Anyone can allocate
* down to the min watermarks controlled by min_free_kbytes above.
*/
int extra_free_kbytes = 0;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
* Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
*
* Assuming that there will be no reference to those newly initialized
* pages before they are ever allocated, this should have no effect on
* KASAN memory tracking as the poison will be properly inserted at page
* allocation time. The only corner case is when pages are allocated by
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
return static_branch_unlikely(&deferred_pages) ||
(!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return true;
return false;
}
/*
* Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
static unsigned long prev_end_pfn, nr_initialised;
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
*/
if (prev_end_pfn != end_pfn) {
prev_end_pfn = end_pfn;
nr_initialised = 0;
}
/* Always populate low zones for address-constrained allocations */
if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
return true;
/*
* We start only with one section of pages, more pages are added as
* needed until the rest of deferred pages are initialized.
*/
nr_initialised++;
if ((nr_initialised > PAGES_PER_SECTION) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
NODE_DATA(nid)->first_deferred_pfn = pfn;
return true;
}
return false;
}
#else
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
PageSkipKASanPoison(page);
}
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
return false;
}
#endif
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
return (word >> bitidx) & mask;
}
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long mask)
{
return __get_pfnblock_flags_mask(page, pfn, mask);
}
EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
int isolate_anon_lru_page(struct page *page)
{
int ret;
if (!PageLRU(page) || !PageAnon(page))
return -EINVAL;
if (!get_page_unless_zero(page))
return -EINVAL;
ret = isolate_lru_page(page);
put_page(page);
return ret;
}
EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
mask <<= bitidx;
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
break;
word = old_word;
}
}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pfnblock_flags_mask(page, (unsigned long)migratetype,
page_to_pfn(page), MIGRATETYPE_MASK);
}
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret = 0;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
if (!zone_spans_pfn(zone, pfn))
ret = 1;
} while (zone_span_seqretry(zone, seq));
if (ret)
pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
pfn, zone_to_nid(zone), zone->name,
start_pfn, start_pfn + sp);
return ret;
}
static int page_is_consistent(struct zone *zone, struct page *page)
{
if (!pfn_valid_within(page_to_pfn(page)))
return 0;
if (zone != page_zone(page))
return 0;
return 1;
}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
if (!page_is_consistent(zone, page))
return 1;
return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
#endif
static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
goto out;
}
if (nr_unshown) {
pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
__dump_page(page, reason);
dump_page_owner(page);
print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
page_mapcount_reset(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page" and have PG_head set.
*
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
*
* The first tail page's ->compound_dtor holds the offset in array of compound
* page destructors. See compound_page_dtors.
*
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
__SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
if (!debug_guardpage_enabled())
return;
__ClearPageGuard(page);
set_page_private(page, 0);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
/*
* Enable static keys related to various memory debugging and hardening options.
* Some override others, and depend on early params that are evaluated in the
* order of appearance. So we need to first gather the full picture of what was
* enabled, and then make decisions.
*/
void init_mem_debugging_and_hardening(void)
{
bool page_poisoning_requested = false;
#ifdef CONFIG_PAGE_POISONING
/*
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
if (page_poisoning_enabled() ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled())) {
static_branch_enable(&_page_poisoning_enabled);
page_poisoning_requested = true;
}
#endif
if (_init_on_alloc_enabled_early) {
if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
static_branch_enable(&_debug_pagealloc_enabled);
if (!debug_guardpage_minorder())
return;
static_branch_enable(&_debug_guardpage_enabled);
#endif
}
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
}
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
* For recording whether a page is in the buddy system, we set PageBuddy.
* Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
static inline bool page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
if (buddy_order(buddy) != order)
return false;
/*
* zone check is done late to avoid uselessly calculating
* zone/node ids for pages that could never merge.
*/
if (page_zone_id(page) != page_zone_id(buddy))
return false;
VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return true;
}
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
capc->cc->zone == zone ? capc : NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
if (!capc || order != capc->cc->order)
return false;
/* Do not accidentally pollute CMA or isolated regions*/
if (is_migrate_cma(migratetype) ||
is_migrate_isolate(migratetype))
return false;
/*
* Do not let lower order allocations polluate a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
*/
if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
return false;
capc->page = page;
return true;
}
#else
static inline struct capture_control *task_capc(struct zone *zone)
{
return NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
return false;
}
#endif /* CONFIG_COMPACTION */
/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add(&page->lru, &area->free_list[migratetype]);
area->nr_free++;
}
/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add_tail(&page->lru, &area->free_list[migratetype]);
area->nr_free++;
}
/*
* Used for pages which are on another list. Move the pages to the tail
* of the list - so the moved pages won't immediately be considered for
* allocation again (e.g., optimization for memory onlining).
*/
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order)
{
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
list_del(&page->lru);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
}
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
struct page *higher_page, *higher_buddy;
unsigned long combined_pfn;
if (order >= MAX_ORDER - 2)
return false;
if (!pfn_valid_within(buddy_pfn))
return false;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
return pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1);
}
/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PageBuddy.
* Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- nyc
*/
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
unsigned long combined_pfn;
unsigned int max_order;
struct page *buddy;
bool to_tail;
max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
if (!pfn_valid_within(buddy_pfn))
goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order = order + 1;
goto continue_merging;
}
done_merging:
set_buddy_order(page, order);
if (fpi_flags & FPI_TO_TAIL)
to_tail = true;
else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
if (to_tail)
add_to_free_list_tail(page, zone, order, migratetype);
else
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
* check if necessary.
*/
static inline bool page_expected_state(struct page *page,
unsigned long check_flags)
{
if (unlikely(atomic_read(&page->_mapcount) != -1))
return false;
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
(unsigned long)page->mem_cgroup |
#endif
(page->flags & check_flags)))
return false;
return true;
}
static const char *page_bad_reason(struct page *page, unsigned long flags)
{
const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
if (unlikely(page->flags & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
}
static void check_free_page_bad(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
static inline int check_free_page(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
check_free_page_bad(page);
return 1;
}
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
int ret = 1;
/*
* We rely page->lru.next never has bit 0 set, unless the page
* is PageTail(). Let's make sure that's true even for poisoned ->lru.
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
ret = 0;
goto out;
}
switch (page - head_page) {
case 1:
/* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
bad_page(page, "nonzero compound_mapcount");
goto out;
}
break;
case 2:
/*
* the second tail page: ->mapping is
* deferred_list.next -- ignore value.
*/
break;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
out:
page->mapping = NULL;
clear_compound_head(page);
return ret;
}
static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
if (zero_tags) {
for (i = 0; i < numpages; i++)
tag_clear_highpage(page + i);
return;
}
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
u8 tag = page_kasan_tag(page + i);
page_kasan_tag_reset(page + i);
clear_highpage(page + i);
page_kasan_tag_set(page + i, tag);
}
kasan_enable_current();
}
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
if (unlikely(PageHWPoison(page)) && !order) {
/*
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
free_page_pinner(page, order);
return false;
}
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
bool compound = PageCompound(page);
int i;
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
if (compound)
ClearPageDoubleMap(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
if (unlikely(check_free_page(page + i))) {
bad++;
continue;
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += check_free_page(page);
if (bad)
return false;
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
free_page_pinner(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
kernel_poison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* kasan_free_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
if (kasan_has_integrated_init()) {
if (!skip_kasan_poison)
kasan_free_pages(page, order);
} else {
bool init = want_init_on_free();
if (init)
kernel_init_free_pages(page, 1 << order, false);
if (!skip_kasan_poison)
kasan_poison_pages(page, order, init);
}
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
* happen after this.
*/
arch_free_page(page, order);
debug_pagealloc_unmap_pages(page, 1 << order);
return true;
}
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
* to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
* moved from pcp lists to free lists.
*/
static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_free_page(page);
else
return false;
}
#else
/*
* With DEBUG_VM disabled, order-0 pages being freed are checked only when
* moving from pcp lists to free list in order to reduce overhead. With
* debug_pagealloc enabled, they are checked also immediately when being freed
* to the pcp lists.
*/
static bool free_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
return free_pages_prepare(page, 0, true, FPI_NONE);
else
return free_pages_prepare(page, 0, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
{
return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */
static inline void prefetch_buddy(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
struct page *buddy = page + (buddy_pfn - pfn);
prefetch(buddy);
}
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
* If the zone was previously in an "all pages pinned" state then look to
* see if this freeing clears that state.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
while (count) {
struct list_head *list;
/*
* Remove pages from lists in a round-robin fashion. A
* batch_free count is maintained that is incremented when an
* empty list is encountered. This is so more pages are freed
* off fuller lists instead of spinning excessively around empty
* lists
*/
do {
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
list = &pcp->lists[migratetype];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
batch_free = count;
do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
pcp->count--;
if (bulkfree_pcp_prepare(page))
continue;
list_add_tail(&page->lru, &head);
/*
* We are going to put the page back to the global
* pool, prefetch its buddy to speed up later access
* under zone->lock. It is believed the overhead of
* an additional test and calculating buddy_pfn here
* can be offset by reduced memory latency later. To
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
if (prefetch_nr++ < pcp->batch)
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
/*
* Use safe version since after __free_one_page(),
* page->lru.next will not point to original list.
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
page_kasan_tag_reset(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
pg_data_t *pgdat;
int nid, zid;
if (!early_page_uninitialised(pfn))
return;
nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
for (; start_pfn < end_pfn; start_pfn++) {
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
init_reserved_page(start_pfn);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
/*
* no need for atomic set_bit because the struct
* page is not visible yet so nobody should
* access it yet.
*/
__SetPageReserved(page);
}
}
}
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype,
fpi_flags);
local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;
/*
* When initializing the memmap, __init_single_page() sets the refcount
* of all pages to 1 ("allocated"/"not free"). We have to set the
* refcount of all involved pages to 0.
*/
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
int __meminit __early_pfn_to_nid(unsigned long pfn,
struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
int nid;
if (state->last_start <= pfn && pfn < state->last_end)
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
}
return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid < 0)
nid = first_online_node;
spin_unlock(&early_pfn_lock);
return nid;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
__free_pages_core(page, order);
}
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
* with the migration of free compaction scanner. The scanners then need to
* use only pfn_valid_within() check for arches that allow holes within
* pageblocks.
*
* Return struct page pointer of start_pfn, or NULL if checks were not passed.
*
* It's possible on some configurations to have a setup like node0 node1 node0
* i.e. it's possible that all pages within a zones range of pages do not
* belong to a single zone. We assume that a border between node0 and node1
* can occur within a single pageblock, but not a node0 node1 node0
* interleaving within a single pageblock. It is therefore sufficient to check
* the first and last page of a pageblock and avoid checking each individual
* page in a pageblock.
*/
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
{
struct page *start_page;
struct page *end_page;
/* end_pfn is one past the range we are checking */
end_pfn--;
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
if (!start_page)
return NULL;
if (page_zone(start_page) != zone)
return NULL;
end_page = pfn_to_page(end_pfn);
/* This gives a shorter code than deriving page_zone(end_page) */
if (page_zone_id(start_page) != page_zone_id(end_page))
return NULL;
return start_page;
}
void set_zone_contiguous(struct zone *zone)
{
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
cond_resched();
}
/* We confirm that there is no hole */
zone->contiguous = true;
}
void clear_zone_contiguous(struct zone *zone)
{
zone->contiguous = false;
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(unsigned long pfn,
unsigned long nr_pages)
{
struct page *page;
unsigned long i;
if (!nr_pages)
return;
page = pfn_to_page(pfn);
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, 0);
}
}
/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
static inline void __init pgdat_init_report_one_done(void)
{
if (atomic_dec_and_test(&pgdat_init_n_undone))
complete(&pgdat_init_all_done_comp);
}
/*
* Returns true if page needs to be initialized or freed to buddy allocator.
*
* First we check if pfn is valid on architectures where it is possible to have
* holes within pageblock_nr_pages. On systems where it is not possible, this
* function is optimized out.
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
*/
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
return true;
}
/*
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
} else {
nr_free++;
}
}
/* Free the last block of pages to allocator */
deferred_free_range(pfn - nr_free, nr_free);
}
/*
* Initialize struct pages. We minimize pfn page lookups and scheduler checks
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
unsigned long nr_pgmask = pageblock_nr_pages - 1;
int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
} else {
page++;
}
__init_single_page(page, pfn, zid, nid);
nr_pages++;
}
return (nr_pages);
}
/*
* This function is meant to pre-load the iterator for the zone init.
* Specifically it walks through the ranges until we are caught up to the
* first_init_pfn value and exits there. If we never encounter the value we
* return false indicating there are no valid ranges left.
*/
static bool __init
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
unsigned long *spfn, unsigned long *epfn,
unsigned long first_init_pfn)
{
u64 j;
/*
* Start out by walking through the ranges in this zone that have
* already been initialized. We don't need to do anything with them
* so we just need to flush them out of the system.
*/
for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
if (*epfn <= first_init_pfn)
continue;
if (*spfn < first_init_pfn)
*spfn = first_init_pfn;
*i = j;
return true;
}
return false;
}
/*
* Initialize and free pages. We do it in two loops: first we initialize
* struct page, then free to buddy allocator, because while we are
* freeing pages we can access pages that are ahead (computing buddy
* page in __free_one_page()).
*
* In order to try and keep some memory in the cache we have the loop
* broken along max page order boundaries. This way we will not cause
* any issues with the buddy page computation.
*/
static unsigned long __init
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
unsigned long *end_pfn)
{
unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
unsigned long spfn = *start_pfn, epfn = *end_pfn;
unsigned long nr_pages = 0;
u64 j = *i;
/* First we loop through and initialize the page values */
for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
unsigned long t;
if (mo_pfn <= *start_pfn)
break;
t = min(mo_pfn, *end_pfn);
nr_pages += deferred_init_pages(zone, *start_pfn, t);
if (mo_pfn < *end_pfn) {
*start_pfn = mo_pfn;
break;
}
}
/* Reset values and now loop through freeing pages as needed */
swap(j, *i);
for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
unsigned long t;
if (mo_pfn <= spfn)
break;
t = min(mo_pfn, epfn);
deferred_free_pages(spfn, t);
if (mo_pfn <= epfn)
break;
}
return nr_pages;
}
static void __init
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
void *arg)
{
unsigned long spfn, epfn;
struct zone *zone = arg;
u64 i;
deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
/*
* Initialize and free pages in MAX_ORDER sized increments so that we
* can avoid introducing any issues with the buddy allocator.
*/
while (spfn < end_pfn) {
deferred_init_maxorder(&i, zone, &spfn, &epfn);
cond_resched();
}
}
/* An arch may override for more concurrency. */
__weak int __init
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
return 1;
}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
unsigned long spfn = 0, epfn = 0;
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
int zid, max_threads;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(current, cpumask);
pgdat_resize_lock(pgdat, &flags);
first_init_pfn = pgdat->first_deferred_pfn;
if (first_init_pfn == ULONG_MAX) {
pgdat_resize_unlock(pgdat, &flags);
pgdat_init_report_one_done();
return 0;
}
/* Sanity check boundaries */
BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
/*
* Once we unlock here, the zone cannot be grown anymore, thus if an
* interrupt thread must allocate this early in boot, zone must be
* pre-grown prior to start of deferred page initialization.
*/
pgdat_resize_unlock(pgdat, &flags);
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
if (first_init_pfn < zone_end_pfn(zone))
break;
}
/* If the zone is empty somebody else may have cleared out the zone */
if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
first_init_pfn))
goto zone_empty;
max_threads = deferred_page_init_max_threads(cpumask);
while (spfn < epfn) {
unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
struct padata_mt_job job = {
.thread_fn = deferred_init_memmap_chunk,
.fn_arg = zone,
.start = spfn,
.size = epfn_align - spfn,
.align = PAGES_PER_SECTION,
.min_chunk = PAGES_PER_SECTION,
.max_threads = max_threads,
};
padata_do_multithreaded(&job);
deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
epfn_align);
}
zone_empty:
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
pr_info("node %d deferred pages initialised in %ums\n",
pgdat->node_id, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
}
/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
* of SECTION_SIZE bytes by initializing struct pages in increments of
* PAGES_PER_SECTION * sizeof(struct page) bytes.
*
* Return true when zone was grown, otherwise return false. We return true even
* when we grow less than requested, to let the caller decide if there are
* enough pages to satisfy the allocation.
*
* Note: We use noinline because this function is needed only during boot, and
* it is called from a __ref function _deferred_grow_zone. This way we are
* making sure that it is not inlined into permanent text section.
*/
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
unsigned long spfn, epfn, flags;
unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
return false;
pgdat_resize_lock(pgdat, &flags);
/*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
if (first_deferred_pfn != pgdat->first_deferred_pfn) {
pgdat_resize_unlock(pgdat, &flags);
return true;
}
/* If the zone is empty somebody else may have cleared out the zone */
if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
first_deferred_pfn)) {
pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
/* Retry only once. */
return first_deferred_pfn != ULONG_MAX;
}
/*
* Initialize and free pages in MAX_ORDER sized increments so
* that we can avoid introducing any issues with the buddy
* allocator.
*/
while (spfn < epfn) {
/* update our first deferred PFN for this section */
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
continue;
/* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
}
/*
* deferred_grow_zone() is __init, but it is called from
* get_page_from_freelist() during early boot until deferred_pages permanently
* disables this call. This is why we have refdata wrapper to avoid warning,
* and to ensure that the function body gets unloaded.
*/
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
return deferred_grow_zone(zone, order);
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
struct zone *zone;
int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
wait_for_completion(&pgdat_init_all_done_comp);
/*
* The number of managed pages has changed due to the initialisation
* so the pcpu batch and high limits needs to be updated or the limits
* will be artificially small.
*/
for_each_populated_zone(zone)
zone_pcp_update(zone);
/*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
static_branch_disable(&deferred_pages);
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
/* Discard memblock private memory */
memblock_discard();
for_each_node_state(nid, N_MEMORY)
shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
}
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
struct page *p = page;
do {
__ClearPageReserved(p);
set_page_count(p, 0);
} while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
if (pageblock_order >= MAX_ORDER) {
i = pageblock_nr_pages;
p = page;
do {
set_page_refcounted(p);
__free_pages(p, MAX_ORDER - 1);
p += MAX_ORDER_NR_PAGES;
} while (i -= MAX_ORDER_NR_PAGES);
} else {
set_page_refcounted(page);
__free_pages(page, pageblock_order);
}
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
#endif
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}
static void check_new_page_bad(struct page *page)
{
if (unlikely(page->flags & __PG_HWPOISON)) {
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
* This page is about to be returned from the page allocator
*/
static inline int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
return 0;
check_new_page_bad(page);
return 1;
}
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
* being allocated from pcp lists. With debug_pagealloc also enabled, they are
* also checked when pcp lists are refilled from the free lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
}
static inline bool check_new_pcp(struct page *page)
{
return check_new_page(page);
}
#else
/*
* With DEBUG_VM disabled, free order-0 pages are checked for expected state
* when pcp lists are being refilled from the free lists. With debug_pagealloc
* enabled, they are also checked when being allocated from the pcp lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
}
static inline bool check_new_pcp(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
}
#endif /* CONFIG_DEBUG_VM */
static bool check_new_pages(struct page *page, unsigned int order)
{
int i;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return true;
}
return false;
}
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
/*
* Page unpoisoning must happen before memory initialization.
* Otherwise, the poison pattern will be overwritten for __GFP_ZERO
* allocations and the page unpoisoning code will complain.
*/
kernel_unpoison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* kasan_alloc_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*/
if (kasan_has_integrated_init()) {
kasan_alloc_pages(page, order, gfp_flags);
} else {
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
kasan_unpoison_pages(page, order, init);
if (init)
kernel_init_free_pages(page, 1 << order,
gfp_flags & __GFP_ZEROTAGS);
}
set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
post_alloc_hook(page, order, gfp_flags);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
trace_android_vh_test_clear_look_around_ref(page);
}
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
/*
* Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
int pages_moved = 0;
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
page++;
continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
return pages_moved;
}
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
if (num_movable)
*num_movable = 0;
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
end_page = start_page + pageblock_nr_pages - 1;
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
start_page = page;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
return move_freepages(zone, start_page, end_page, migratetype,
num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
int start_order, int migratetype)
{
int nr_pageblocks = 1 << (start_order - pageblock_order);
while (nr_pageblocks--) {
set_pageblock_migratetype(pageblock_page, migratetype);
pageblock_page += pageblock_nr_pages;
}
}
/*
* When we are falling back to another migratetype during allocation, try to
* steal extra free pages from the same pageblocks to satisfy further
* allocations, instead of polluting multiple pageblocks.
*
* If we are stealing a relatively large buddy page, it is likely there will
* be more free pages in the pageblock, so try to steal them all. For
* reclaimable and unmovable allocations, we steal regardless of page size,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
*/
static bool can_steal_fallback(unsigned int order, int start_mt)
{
/*
* Leaving this order check is intended, although there is
* relaxed order check in next check. The reason is that
* we can actually steal whole pageblock if this condition met,
* but, below check doesn't guarantee it and that is just heuristic
* so could be changed anytime.
*/
if (order >= pageblock_order)
return true;
if (order >= pageblock_order / 2 ||
start_mt == MIGRATE_RECLAIMABLE ||
start_mt == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled)
return true;
return false;
}
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
return false;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
* in a small area, boosting the watermark can cause an out of
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
/*
* high watermark may be uninitialised if fragmentation occurs
* very early in boot so do not boost. We do not fall
* through and boost by pageblock_nr_pages as failing
* allocations that early means that reclaim is not going
* to help and it may even be impossible to reclaim the
* boosted watermark resulting in a hang.
*/
if (!max_boost)
return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
return true;
}
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
* pageblock to our migratetype and determine how many already-allocated pages
* are there in the pageblock with a compatible migratetype. If at least half
* of pages are free or compatible, we can change migratetype of the pageblock
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
old_block_type = get_pageblock_migratetype(page);
/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
if (is_migrate_highatomic(old_block_type))
goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
goto single_page;
}
/*
* Boost watermarks to increase reclaim pressure to reduce the
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
* we just obtained. For other types it's a bit more tricky.
*/
if (start_type == MIGRATE_MOVABLE) {
alike_pages = movable_pages;
} else {
/*
* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
* to MOVABLE pageblock, consider all non-movable pages as
* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
if (old_block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
alike_pages = 0;
}
/* moving whole block can fail due to zone boundary conditions */
if (!free_pages)
goto single_page;
/*
* If a sufficient number of pages in the block are either free or of
* comparable migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
return;
single_page:
move_to_free_list(page, zone, current_order, start_type);
}
/*
* Check whether there is a suitable fallback freepage with requested order.
* If only_stealable is true, this function returns fallback_mt only if
* we can steal other freepages all together. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal)
{
int i;
int fallback_mt;
if (area->nr_free == 0)
return -1;
*can_steal = false;
for (i = 0;; i++) {
fallback_mt = fallbacks[migratetype][i];
if (fallback_mt == MIGRATE_TYPES)
break;
if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
*can_steal = true;
if (!only_stealable)
return fallback_mt;
if (*can_steal)
return fallback_mt;
}
return -1;
}
/*
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
unsigned int alloc_order)
{
int mt;
unsigned long max_managed, flags;
/*
* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
* Check is race-prone but harmless.
*/
max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
spin_lock_irqsave(&zone->lock, flags);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
goto out_unlock;
/* Yoink! */
mt = get_pageblock_migratetype(page);
if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
&& !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* Used when an allocation is about to fail under memory pressure. This
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
*
* If @force is true, try to unreserve a pageblock even though highatomic
* pageblock is exhausted.
*/
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
int order;
bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
* is really high.
*/
if (!force && zone->nr_reserved_highatomic <=
pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
* in this loop althoug we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
* drain to add pages to MIGRATE_HIGHATOMIC
* while unreserving so be safe and watch for
* underflows.
*/
zone->nr_reserved_highatomic -= min(
pageblock_nr_pages,
zone->nr_reserved_highatomic);
}
/*
* Convert to ac->migratetype and avoid the normal
* pageblock stealing heuristics. Minimally, the caller
* is doing the work and needs the pages. More
* importantly, if the block was always converted to
* MIGRATE_UNMOVABLE or another type then the number
* of pageblocks that cannot be completely freed
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
ret = move_freepages_block(zone, page, ac->migratetype,
NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
}
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags)
{
struct page *page = __rmqueue_cma_fallback(zone, order);
trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
return page;
}
#else
static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags)
{
return NULL;
}
#endif
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page;
if (is_migrate_cma(migratetype))
page = __rmqueue_cma(zone, order, migratetype,
alloc_flags);
else
page = __rmqueue(zone, order, migratetype, alloc_flags);
if (unlikely(page == NULL))
break;
if (unlikely(check_pcp_refill(page)))
continue;
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}
/*
* Return the pcp list that corresponds to the migrate type if that list isn't
* empty.
* If the list is empty return NULL.
*/
static struct list_head *get_populated_pcp_list(struct zone *zone,
unsigned int order, struct per_cpu_pages *pcp,
int migratetype, unsigned int alloc_flags)
{
struct list_head *list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, order,
pcp->batch, list,
migratetype, alloc_flags);
if (list_empty(list))
list = NULL;
}
return list;
}
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*
* Note that this function must be called with the thread pinned to
* a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain, batch;
local_irq_save(flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
local_irq_restore(flags);
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
local_irq_restore(flags);
}
/*
* Drain pcplists of all zones on the indicated processor.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone) {
drain_pages_zone(cpu, zone);
}
}
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*
* The CPU has to be pinned. When zone parameter is non-NULL, spill just
* the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
int cpu = smp_processor_id();
if (zone)
drain_pages_zone(cpu, zone);
else
drain_pages(cpu);
}
static void drain_local_pages_wq(struct work_struct *work)
{
struct pcpu_drain *drain;
drain = container_of(work, struct pcpu_drain, work);
/*
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
* a cpu pinned worker to an unbound one. We can operate on a different
* cpu which is allright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
drain_local_pages(drain->zone);
preempt_enable();
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
*
* Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
int cpu;
/*
* Allocate in the BSS so we wont require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
* initialized.
*/
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
*/
if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
if (!zone)
return;
mutex_lock(&pcpu_drain_mutex);
}
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pcp;
struct zone *z;
bool has_pcps = false;
if (zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
pcp = per_cpu_ptr(z->pageset, cpu);
if (pcp->pcp.count) {
has_pcps = true;
break;
}
}
}
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
for_each_cpu(cpu, &cpus_with_pcps) {
struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
drain->zone = zone;
INIT_WORK(&drain->work, drain_local_pages_wq);
queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
#ifdef CONFIG_HIBERNATION
/*
* Touch the watchdog for every WD_PAGE_COUNT pages.
*/
#define WD_PAGE_COUNT (128*1024)
void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
if (zone_is_empty(zone))
return;
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (!--page_count) {
touch_nmi_watchdog();
page_count = WD_PAGE_COUNT;
}
if (page_zone(page) != zone)
continue;
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
&zone->free_area[order].free_list[t], lru) {
unsigned long i;
pfn = page_to_pfn(page);
for (i = 0; i < (1UL << order); i++) {
if (!--page_count) {
touch_nmi_watchdog();
page_count = WD_PAGE_COUNT;
}
swsusp_set_page_free(pfn_to_page(pfn + i));
}
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_PM */
static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
{
int migratetype;
if (!free_pcp_prepare(page))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
return true;
}
static void free_unref_page_commit(struct page *page, unsigned long pfn)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int migratetype;
bool pcp_skip_cma_pages = false;
migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
&pcp_skip_cma_pages);
if (unlikely(is_migrate_isolate(migratetype)) ||
pcp_skip_cma_pages) {
free_one_page(zone, page, pfn, 0, migratetype,
FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
}
}
/*
* Free a 0-order page
*/
void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
return;
local_irq_save(flags);
free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
/*
* Free a list of 0-order pages
*/
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
set_page_private(page, pfn);
}
local_irq_save(flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
set_page_private(page, 0);
trace_mm_page_free_batched(page);
free_unref_page_commit(page, pfn);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
local_irq_restore(flags);
batch_count = 0;
local_irq_save(flags);
}
}
local_irq_restore(flags);
}
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
void split_page(struct page *page, unsigned int order)
{
int i;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, 1 << order);
split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
unsigned long watermark;
struct zone *zone;
int mt;
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
* watermark, because we already know our high-order page
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
/* Remove page from free list */
del_page_from_free_list(page, zone, order);
/*
* Set the pageblock if the isolated page is at least half of a
* pageblock
*/
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
&& !is_migrate_highatomic(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
}
return 1UL << order;
}
/**
* __putback_isolated_page - Return a now-isolated page back where we got it
* @page: Page that was isolated
* @order: Order of the isolated page
* @mt: The page's pageblock's migratetype
*
* This function is meant to return a page pulled from the free lists via
* __isolate_free_page back to the free lists they were pulled from.
*/
void __putback_isolated_page(struct page *page, unsigned int order, int mt)
{
struct zone *zone = page_zone(page);
/* zone lock should be held when this function is called */
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
__free_one_page(page, page_to_pfn(page), zone, order, mt,
FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
*/
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
/* skip numa counters update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return;
if (zone_to_nid(z) != numa_node_id())
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
__inc_numa_state(z, NUMA_HIT);
else {
__inc_numa_state(z, NUMA_MISS);
__inc_numa_state(preferred_zone, NUMA_FOREIGN);
}
__inc_numa_state(z, local_stat);
#endif
}
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
gfp_t gfp_flags)
{
struct page *page = NULL;
struct list_head *list = NULL;
do {
/* First try to get CMA pages */
if (migratetype == MIGRATE_MOVABLE &&
alloc_flags & ALLOC_CMA) {
list = get_populated_pcp_list(zone, 0, pcp,
get_cma_migrate_type(), alloc_flags);
}
if (list == NULL) {
/*
* Either CMA is not suitable or there are no
* free CMA pages.
*/
list = get_populated_pcp_list(zone, 0, pcp,
migratetype, alloc_flags);
if (unlikely(list == NULL) ||
unlikely(list_empty(list)))
return NULL;
}
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
return page;
}
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, gfp_t gfp_flags,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
gfp_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
}
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
migratetype, alloc_flags);
goto out;
}
/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
/*
* order-0 request can reach here when the pcplist is skipped
* due to non-CMA allocation context. HIGHATOMIC area is
* reserved for high-order atomic allocation, so order-0
* request should skip it.
*/
if (order > 0 && alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page) {
if (migratetype == MIGRATE_MOVABLE &&
alloc_flags & ALLOC_CMA)
page = __rmqueue_cma(zone, order, migratetype,
alloc_flags);
if (!page)
page = __rmqueue(zone, order, migratetype,
alloc_flags);
}
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
trace_android_vh_rmqueue(preferred_zone, zone, order,
gfp_flags, alloc_flags, migratetype);
local_irq_restore(flags);
out:
/* Separate test+clear to avoid unnecessary atomics */
if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
local_irq_restore(flags);
return NULL;
}
#ifdef CONFIG_FAIL_PAGE_ALLOC
static struct {
struct fault_attr attr;
bool ignore_gfp_highmem;
bool ignore_gfp_reclaim;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_reclaim = true,
.ignore_gfp_highmem = true,
.min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
{
return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
return false;
if (gfp_mask & __GFP_NOFAIL)
return false;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
return false;
if (fail_page_alloc.ignore_gfp_reclaim &&
(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
return should_fail(&fail_page_alloc.attr, 1 << order);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_page_alloc_debugfs(void)
{
umode_t mode = S_IFREG | 0600;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
debugfs_create_bool("ignore-gfp-wait", mode, dir,
&fail_page_alloc.ignore_gfp_reclaim);
debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem);
debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
return 0;
}
late_initcall(fail_page_alloc_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#else /* CONFIG_FAIL_PAGE_ALLOC */
static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
long unusable_free = (1 << order) - 1;
/*
* If the caller does not have rights to ALLOC_HARDER then subtract
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
if (likely(!alloc_harder))
unusable_free += z->nr_reserved_highatomic;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
return unusable_free;
}
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
* one free page of a suitable size. Checking now avoids taking the zone lock
* to check in the allocation paths if no pages are free.
*/
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
int o;
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (unlikely(alloc_harder)) {
/*
* OOM victims can try even harder than normal ALLOC_HARDER
* users on the grounds that it's definitely going to be in
* the exit path shortly and free memory. Any allocation it
* makes during the free path will be small and short-lived.
*/
if (alloc_flags & ALLOC_OOM)
min -= min / 2;
else
min -= min / 4;
}
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
return false;
/* If this is an order-0 request then the watermark is fine */
if (!order)
return true;
/* For a high-order request, check at least one suitable page is free */
for (o = order; o < MAX_ORDER; o++) {
struct free_area *area = &z->free_area[o];
int mt;
if (!area->nr_free)
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
#ifdef CONFIG_CMA
/*
* Note that this check is needed only
* when MIGRATE_CMA < MIGRATE_PCPTYPES.
*/
if (mt == MIGRATE_CMA)
continue;
#endif
if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
!free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
return true;
}
return false;
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
EXPORT_SYMBOL_GPL(zone_watermark_ok);
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags, gfp_t gfp_mask)
{
long free_pages;
free_pages = zone_page_state(z, NR_FREE_PAGES);
/*
* Fast check for order-0 only. If this fails then the reserves
* need to be calculated.
*/
if (!order) {
long usable_free;
long reserved;
usable_free = free_pages;
reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
/* reserved may over estimate high-atomic reserves. */
usable_free -= min(usable_free, reserved);
if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages))
return true;
/*
* Ignore watermark boosting for GFP_ATOMIC order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
alloc_flags, free_pages);
}
return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
free_pages);
}
EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
#ifdef CONFIG_NUMA
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
/*
* The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
* fragmentation is subtle. If the preferred zone was HIGHMEM then
* premature use of a lower zone may cause lowmem pressure problems that
* are worse than fragmentation. If the next zone is ZONE_DMA then it is
* probably too small. It only makes sense to spread allocations to avoid
* fragmentation between the Normal and DMA32 zones.
*/
static inline unsigned int
alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
unsigned int alloc_flags;
/*
* __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save a branch.
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
if (zone_idx(zone) != ZONE_NORMAL)
return alloc_flags;
/*
* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
* the pointer is within zone->zone_pgdat->node_zones[]. Also assume
* on UMA that if Normal is populated then so is DMA32.
*/
BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
if (nr_online_nodes > 1 && !populated_zone(--zone))
return alloc_flags;
alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
return alloc_flags;
}
static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
unsigned int pflags = current->flags;
if (!(pflags & PF_MEMALLOC_NOCMA) &&
gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
gfp_mask & __GFP_CMA)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone:
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}
/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
return NULL;
}
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
/*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
show_mem(filter, nodemask);
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
if ((gfp_mask & __GFP_NOWARN) ||
!__ratelimit(&nopage_rs) ||
((gfp_mask & __GFP_DMA) && !has_managed_dma()))
return;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
current->comm, &vaf, gfp_mask, &gfp_mask,
nodemask_pr_args(nodemask));
va_end(args);
cpuset_print_current_mems_allowed();
pr_cont("\n");
dump_stack();
warn_alloc_show_mem(gfp_mask, nodemask);
}
static inline struct page *
__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags,
const struct alloc_context *ac)
{
struct page *page;
page = get_page_from_freelist(gfp_mask, order,
alloc_flags|ALLOC_CPUSET, ac);
/*
* fallback to ignore cpuset restriction if our nodes
* are depleted
*/
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
return page;
}
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
struct oom_control oc = {
.zonelist = ac->zonelist,
.nodemask = ac->nodemask,
.memcg = NULL,
.gfp_mask = gfp_mask,
.order = order,
};
struct page *page;
*did_some_progress = 0;
/*
* Acquire the oom lock. If that fails, somebody else is
* making progress for us.
*/
if (!mutex_trylock(&oom_lock)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
}
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure. But make sure that this reclaim
* attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
* allocation which will never fail due to oom_lock already held.
*/
page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
~__GFP_DIRECT_RECLAIM, order,
ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
/* Coredumps can quickly deplete all memory reserves */
if (current->flags & PF_DUMPCORE)
goto out;
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/*
* We have already exhausted all our reclaim opportunities without any
* success so it is time to admit defeat. We will skip the OOM killer
* because it is very likely that the caller has a more reasonable
* fallback than shooting a random task.
*
* The OOM killer may not free memory on a specific node.
*/
if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->highest_zoneidx < ZONE_NORMAL)
goto out;
if (pm_suspended_storage())
goto out;
/*
* XXX: GFP_NOFS allocations should rather fail than rely on
* other request to make a forward progress.
* We are in an unfortunate situation where out_of_memory cannot
* do much for this context but let's try it to at least get
* access to memory reserved if the current task is killed (see
* out_of_memory). Once filesystems are ready to handle allocation
* failures more gracefully we should just bail out here.
*/
/* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
/*
* Help non-failing allocations by giving them access to memory
* reserves
*/
if (gfp_mask & __GFP_NOFAIL)
page = __alloc_pages_cpuset_fallback(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
}
out:
mutex_unlock(&oom_lock);
return page;
}
/*
* Maximum number of compaction retries wit a progress before OOM
* killer is consider as the only way to move forward.
*/
#define MAX_COMPACT_RETRIES 16
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
*/
count_vm_event(COMPACTSTALL);
/* Prep a captured page if available */
if (page)
prep_new_page(page, order, gfp_mask, alloc_flags);
/* Try get a page from the freelist if available */
if (!page)
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
zone->compact_blockskip_flush = false;
compaction_defer_reset(zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
/*
* It's bad if compaction run occurs and fails. The most likely reason
* is that pages exist, but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
cond_resched();
return NULL;
}
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
{
int max_retries = MAX_COMPACT_RETRIES;
int min_priority;
bool ret = false;
int retries = *compaction_retries;
enum compact_priority priority = *compact_priority;
if (!order)
return false;
if (compaction_made_progress(compact_result))
(*compaction_retries)++;
/*
* compaction considers all the zone as desperately out of memory
* so it doesn't really make much sense to retry except when the
* failure could be caused by insufficient priority
*/
if (compaction_failed(compact_result))
goto check_priority;
/*
* compaction was skipped because there are not enough order-0 pages
* to work with, so we retry only if it looks like reclaim can help.
*/
if (compaction_needs_reclaim(compact_result)) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
/*
* make sure the compaction wasn't deferred or didn't bail out early
* due to locks contention before we declare that we should give up.
* But the next retry should use a higher priority if allowed, so
* we don't just keep bailing out endlessly.
*/
if (compaction_withdrawn(compact_result)) {
goto check_priority;
}
/*
* !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
* would need much more detailed feedback from compaction to
* make a better decision.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
if (*compaction_retries <= max_retries) {
ret = true;
goto out;
}
/*
* Make sure there are attempts at the highest priority if we exhausted
* all retries or failed at the lower priorities.
*/
check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
if (*compact_priority > min_priority) {
(*compact_priority)--;
*compaction_retries = 0;
ret = true;
}
out:
trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
return ret;
}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
}
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
{
struct zone *zone;
struct zoneref *z;
if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
return false;
/*
* There are setups with compaction disabled which would prefer to loop
* inside the allocator rather than hit the oom killer prematurely.
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
ac->highest_zoneidx, alloc_flags))
return true;
}
return false;
}
#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_LOCKDEP
static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
static bool __need_fs_reclaim(gfp_t gfp_mask)
{
gfp_mask = current_gfp_context(gfp_mask);
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
/* this guy won't enter reclaim */
if (current->flags & PF_MEMALLOC)
return false;
/* We're only interested __GFP_FS allocations for now */
if (!(gfp_mask & __GFP_FS))
return false;
if (gfp_mask & __GFP_NOLOCKDEP)
return false;
return true;
}
void __fs_reclaim_acquire(void)
{
lock_map_acquire(&__fs_reclaim_map);
}
void __fs_reclaim_release(void)
{
lock_map_release(&__fs_reclaim_map);
}
void fs_reclaim_acquire(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
__fs_reclaim_acquire();
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
__fs_reclaim_release();
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
unsigned int noreclaim_flag;
unsigned long progress;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
cond_resched();
return progress;
}
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
unsigned long pflags;
bool drained = false;
bool skip_pcp_drain = false;
psi_memstall_enter(&pflags);
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
goto out;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
* Shrink them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
if (!skip_pcp_drain)
drain_all_pages(NULL);
drained = true;
goto retry;
}
out:
psi_memstall_leave(&pflags);
return page;
}
static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
last_pgdat = zone->zone_pgdat;
}
}
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/*
* __GFP_HIGH is assumed to be the same as ALLOC_HIGH
* and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save two branches.
*/
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
if (gfp_mask & __GFP_ATOMIC) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
* comment for __cpuset_node_allowed().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
return alloc_flags;
}
static bool oom_reserves_allowed(struct task_struct *tsk)
{
if (!tsk_is_oom_victim(tsk))
return false;
/*
* !MMU doesn't have oom reaper so give access to memory reserves
* only to the thread with TIF_MEMDIE set
*/
if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
return false;
return true;
}
/*
* Distinguish requests which really need access to full memory
* reserves from oom victims which can live with a portion of it
*/
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
return 0;
if (gfp_mask & __GFP_MEMALLOC)
return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
return ALLOC_NO_WATERMARKS;
if (!in_interrupt()) {
if (current->flags & PF_MEMALLOC)
return ALLOC_NO_WATERMARKS;
else if (oom_reserves_allowed(current))
return ALLOC_OOM;
}
return 0;
}
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
*
* We give up when we either have tried MAX_RECLAIM_RETRIES in a row
* without success, or when we couldn't even meet the watermark if we
* reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
* their order will become available due to high fragmentation so
* always increment the no progress counter for them
*/
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
*no_progress_loops = 0;
else
(*no_progress_loops)++;
/*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
/* Before OOM, exhaust highatomic_reserve */
return unreserve_highatomic_pageblock(ac, true);
}
/*
* Keep reclaiming pages while there is a chance this will lead
* somewhere. If none of the target zones can satisfy our allocation
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
/*
* Would the allocation succeed if we reclaimed all
* reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
* an IO to complete to slow down the reclaim and
* prevent from pre mature OOM
*/
if (!did_some_progress) {
unsigned long write_pending;
write_pending = zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
}
ret = true;
goto out;
}
}
out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
* recognize that a particular WQ is congested if the worker thread is
* looping without ever sleeping. Therefore we have to do a short sleep
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
schedule_timeout_uninterruptible(1);
else
cond_resched();
return ret;
}
static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{
/*
* It's possible that cpuset's mems_allowed and the nodemask from
* mempolicy don't intersect. This should be normally dealt with by
* policy_nodemask(), but it's possible to race with cpuset update in
* such a way the check therein was true, and then it became false
* before we got our cpuset_mems_cookie here.
* This assumes that for all allocations, ac->nodemask can come only
* from MPOL_BIND mempolicy (whose documented semantics is to be ignored
* when it does not intersect with the cpuset restrictions) or the
* caller can deal with a violated nodemask.
*/
if (cpusets_enabled() && ac->nodemask &&
!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
ac->nodemask = NULL;
return true;
}
/*
* When updating a task's mems_allowed or mempolicy nodemask, it is
* possible to race with parallel threads in such a way that our
* allocation can fail while the mask is being updated. If we are about
* to fail, check if the cpuset changed during allocation and if so,
* retry.
*/
if (read_mems_allowed_retry(cpuset_mems_cookie))
return true;
return false;
}
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;
unsigned long vh_record;
trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes some THP page fault allocations
*/
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or is prohibited because it recently failed at this
* order, fail immediately unless the allocator has
* requested compaction and reclaim retry.
*
* Reclaim is
* - potentially very expensive because zones are far
* below their low watermarks or this is part of very
* bursty high order allocations,
* - not guaranteed to help because isolate_freepages()
* may not iterate over freed pages as part of its
* linear scan, and
* - unlikely to make entire pageblocks free on its
* own.
*/
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*/
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}
nopage:
/* Deal with possible cpuset update races before we fail */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
return page;
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
*/
if (!in_interrupt() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
return true;
}
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
ac.nodemask = nodemask;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
* you need to access high mem.
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);
static inline void free_the_page(struct page *page, unsigned int order)
{
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
__free_pages_ok(page, order, FPI_NONE);
}
void __free_pages(struct page *page, unsigned int order)
{
trace_android_vh_free_pages(page, order);
if (put_page_testzero(page))
free_the_page(page, order);
else if (!PageHead(page))
while (order-- > 0)
free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
__free_pages(virt_to_page((void *)addr), order);
}
}
EXPORT_SYMBOL(free_pages);
/*
* Page Fragment:
* An arbitrary-length arbitrary-offset area of memory which resides
* within a 0 or higher order page. Multiple fragments within that page
* are individually refcounted, in the page's reference counter.
*
* The page_frag functions below provide a simple allocation framework for
* page fragments. This is used by the network stack and network device
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
__GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
#endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
nc->va = page ? page_address(page) : NULL;
return page;
}
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
void *page_frag_alloc(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
int offset;
if (unlikely(!nc->va)) {
refill:
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
offset = nc->offset - fragsz;
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
if (unlikely(nc->pfmemalloc)) {
free_the_page(page, compound_order(page));
goto refill;
}
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
nc->pagecnt_bias--;
nc->offset = offset;
return nc->va + offset;
}
EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
unsigned long alloc_end = addr + (PAGE_SIZE << order);
unsigned long used = addr + PAGE_ALIGN(size);
split_page(virt_to_page((void *)addr), order);
while (used < alloc_end) {
free_page(used);
used += PAGE_SIZE;
}
}
return (void *)addr;
}
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
* allocate memory in power-of-two pages.
*
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
*
* Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
* alloc_pages_exact_nid - allocate an exact number of physically-contiguous
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
*
* Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
struct page *p;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
}
/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
*
* Release the memory allocated by a previous call to alloc_pages_exact.
*/
void free_pages_exact(void *virt, size_t size)
{
unsigned long addr = (unsigned long)virt;
unsigned long end = addr + PAGE_ALIGN(size);
while (addr < end) {
free_page(addr);
addr += PAGE_SIZE;
}
}
EXPORT_SYMBOL(free_pages_exact);
/**
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
* nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
*
* Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
struct zoneref *z;
struct zone *zone;
/* Just pick one node, since fallback list is circular */
unsigned long sum = 0;
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
unsigned long size = zone_managed_pages(zone);
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
}
return sum;
}
/**
* nr_free_buffer_pages - count number of pages beyond high watermark
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
*
* Return: number of pages beyond high watermark within ZONE_DMA and
* ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
static inline void show_node(struct zone *zone)
{
if (IS_ENABLED(CONFIG_NUMA))
printk("Node %d ", zone_to_nid(zone));
}
long si_mem_available(void)
{
long available;
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
unsigned long reclaimable;
struct zone *zone;
int lru;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
wmark_low += low_wmark_pages(zone);
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
* start swapping. Assume at least half of the page cache, or the
* low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
available += pagecache;
/*
* Part of the reclaimable slab and other kernel memory consists of
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
return available;
}
EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages();
val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages();
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
EXPORT_SYMBOL(si_meminfo);
#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
int zone_type; /* needs to be signed */
unsigned long managed_pages = 0;
unsigned long managed_highpages = 0;
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
#else
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
#endif
val->mem_unit = PAGE_SIZE;
}
#endif
/*
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
*/
static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
if (!(flags & SHOW_MEM_FILTER_NODES))
return false;
/*
* no node mask - aka implicit memory numa policy. Do not bother with
* the synchronization - read_mems_allowed_begin - because we do not
* have to be precise here.
*/
if (!nodemask)
nodemask = &cpuset_current_mems_allowed;
return !node_isset(nid, *nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
[MIGRATE_MOVABLE] = 'M',
[MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_HIGHATOMIC] = 'H',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = 'I',
#endif
};
char tmp[MIGRATE_TYPES + 1];
char *p = tmp;
int i;
for (i = 0; i < MIGRATE_TYPES; i++) {
if (type & (1 << i))
*p++ = types[i];
}
*p = '\0';
printk(KERN_CONT "(%s) ", tmp);
}
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
*
* Bits in @filter:
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
pg_data_t *pgdat;
for_each_populated_zone(zone) {
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_INACTIVE_ANON),
global_node_page_state(NR_ISOLATED_ANON),
global_node_page_state(NR_ACTIVE_FILE),
global_node_page_state(NR_INACTIVE_FILE),
global_node_page_state(NR_ISOLATED_FILE),
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_zone_page_state(NR_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
global_zone_page_state(NR_FREE_CMA_PAGES));
trace_android_vh_show_mapcount_pages(NULL);
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
continue;
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" isolated(anon):%lukB"
" isolated(file):%lukB"
" mapped:%lukB"
" dirty:%lukB"
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" shmem_thp: %lukB"
" shmem_pmdmapped: %lukB"
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
" kernel_stack:%lukB"
#ifdef CONFIG_SHADOW_CALL_STACK
" shadow_call_stack:%lukB"
#endif
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
K(node_page_state(pgdat, NR_INACTIVE_ANON)),
K(node_page_state(pgdat, NR_ACTIVE_FILE)),
K(node_page_state(pgdat, NR_INACTIVE_FILE)),
K(node_page_state(pgdat, NR_UNEVICTABLE)),
K(node_page_state(pgdat, NR_ISOLATED_ANON)),
K(node_page_state(pgdat, NR_ISOLATED_FILE)),
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
for_each_populated_zone(zone) {
int i;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
free_pcp = 0;
for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
show_node(zone);
printk(KERN_CONT
"%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
printk(KERN_CONT "\n");
}
for_each_populated_zone(zone) {
unsigned int order;
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
printk(KERN_CONT "%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
int type;
nr[order] = area->nr_free;
total += nr[order] << order;
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
printk(KERN_CONT "%lu*%lukB ",
nr[order], K(1UL) << order);
if (nr[order])
show_migration_types(types[order]);
}
printk(KERN_CONT "= %lukB\n", K(total));
}
hugetlb_show_meminfo();
printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
show_swap_cache_info();
}
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/*
* Builds allocation fallback zone lists.
*
* Add all populated zones of a node to the zonelist.
*/
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
return nr_zones;
}
#ifdef CONFIG_NUMA
static int __parse_numa_zonelist_order(char *s)
{
/*
* We used to support different zonlists modes but they turned
* out to be just not useful. Let's keep the warning in place
* if somebody still use the cmd line parameter so that we do
* not fail it silently
*/
if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
}
char numa_zonelist_order[] = "Node";
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
return __parse_numa_zonelist_order(buffer);
return proc_dostring(table, write, buffer, length, ppos);
}
#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
/* Give preference to headless and unused nodes */
if (!cpumask_empty(cpumask_of_node(n)))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
unsigned nr_nodes)
{
struct zoneref *zonerefs;
int i;
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
for (i = 0; i < nr_nodes; i++) {
int nr_zones;
pg_data_t *node = NODE_DATA(node_order[i]);
nr_zones = build_zonerefs_node(node, zonerefs);
zonerefs += nr_zones;
}
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
struct zoneref *zonerefs;
int nr_zones;
zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
nr_zones = build_zonerefs_node(pgdat, zonerefs);
zonerefs += nr_zones;
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}
/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;
node_order[nr_nodes++] = node;
prev_node = node;
load--;
}
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
* I.e., first node id of first zone in arg node's generic zonelist.
* Used for initializing percpu 'numa_mem', which is used primarily
* for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
*/
int local_memory_node(int node)
{
struct zoneref *z;
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
return zone_to_nid(z->zone);
}
#endif
static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
struct zoneref *zonerefs;
int nr_zones;
local_node = pgdat->node_id;
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
nr_zones = build_zonerefs_node(pgdat, zonerefs);
zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
zonerefs += nr_zones;
}
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
/*
* This node is hotadded and no memory is yet present. So just
* building zonelists is fine - no need to touch other nodes.
*/
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* We now know the "local memory node" for each node--
* i.e., the node of the first zone in the generic zonelist.
* Set up numa_mem percpu variable for on-line cpus. During
* boot, only the boot cpu should be on-line; we'll init the
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
int cpu;
__build_all_zonelists(NULL);
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
* each zone will be allocated later when the per cpu
* allocator is available.
*
* boot_pagesets are used also for bootstrapping offline
* cpus if the system is already booted because the pagesets
* are needed to initialize allocators on a specific cpu too.
* F.e. the percpu allocator needs the page allocator which
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
* unless system_state == SYSTEM_BOOTING.
*
* __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
void __ref build_all_zonelists(pg_data_t *pgdat)
{
unsigned long vm_total_pages;
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
/* Get the number of free pages beyond high watermark in all zones. */
vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
* more accurate, but expensive to check per-zone. This check is
* made on memory-hotadd so a system can start with mobility
* disabled and enable it later
*/
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
static struct memblock_region *r;
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
for_each_mem_region(r) {
if (*pfn < memblock_region_memory_end_pfn(r))
break;
}
}
if (*pfn >= memblock_region_memory_base_pfn(r) &&
memblock_is_mirror(r)) {
*pfn = memblock_region_memory_end_pfn(r);
return true;
}
}
return false;
}
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
*
* All aligned pageblocks are initialized to the specified migratetype
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
* memory. We limit the total number of pages to initialize to just
* those that might contain the memory mapping. We will defer the
* ZONE_DEVICE page initialization until after we have released
* the hotplug lock.
*/
if (zone == ZONE_DEVICE) {
if (!altmap)
return;
if (start_pfn == altmap->base_pfn)
start_pfn += altmap->reserve;
end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
}
#endif
for (pfn = start_pfn; pfn < end_pfn; ) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
if (context == MEMINIT_EARLY) {
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, zone_end_pfn))
break;
}
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMINIT_HOTPLUG)
__SetPageReserved(page);
/*
* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
* such that unmovable allocations won't be scattered all
* over the place during system boot.
*/
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
}
}
#ifdef CONFIG_ZONE_DEVICE
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages,
struct dev_pagemap *pgmap)
{
unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
int nid = pgdat->node_id;
if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
return;
/*
* The call to memmap_init should have already taken care
* of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages.
*/
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone_idx, nid);
/*
* Mark page reserved as it will need to wait for onlining
* phase for it to be fully associated with a zone.
*
* We can use the non-atomic __set_bit operation for setting
* the flag as we are still initializing the pages.
*/
__SetPageReserved(page);
/*
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
page->pgmap = pgmap;
page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made.
*
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
* memmap_init_zone_range().
*
* But, there could be struct pages that correspond to holes in
* memblock.memory. This can happen because of the following reasons:
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
* Explicitly initialize those struct pages so that:
* - PG_Reserved is set
* - zone and node links point to zone and node that span the page if the
* hole is in the middle of a zone
* - zone and node links point to adjacent zone/node if the hole falls on
* the zone boundary; the pages in such holes will be prepended to the
* zone/node above the hole except for the trailing pages in the last
* section that will be appended to the zone/node below.
*/
static void __init init_unavailable_range(unsigned long spfn,
unsigned long epfn,
int zone, int node)
{
unsigned long pfn;
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ pageblock_nr_pages - 1;
continue;
}
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
if (pgcnt)
pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
node, zone_names[zone], pgcnt);
}
#else
static inline void init_unavailable_range(unsigned long spfn,
unsigned long epfn,
int zone, int node)
{
}
#endif
static void __init memmap_init_zone_range(struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn,
unsigned long *hole_pfn)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
if (start_pfn >= end_pfn)
return;
memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
if (*hole_pfn < start_pfn)
init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
*hole_pfn = end_pfn;
}
void __init __weak memmap_init(void)
{
unsigned long start_pfn, end_pfn;
unsigned long hole_pfn = 0;
int i, j, zone_id, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
struct pglist_data *node = NODE_DATA(nid);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = node->node_zones + j;
if (!populated_zone(zone))
continue;
memmap_init_zone_range(zone, start_pfn, end_pfn,
&hole_pfn);
zone_id = j;
}
}
#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
* section_end].
* Append the pages in this hole to the highest zone in the last
* node.
* The call to init_unavailable_range() is outside the ifdef to
* silence the compiler warining about zone_id set but not used;
* for FLATMEM it is a nop anyway
*/
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
if (hole_pfn < end_pfn)
#endif
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
/* A stub for backwards compatibility with custom implementatin on IA-64 */
void __meminit __weak arch_memmap_init(unsigned long size, int nid,
unsigned long zone,
unsigned long range_start_pfn)
{
}
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
*
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
}
/*
* pcp->high and pcp->batch values are related and dependent on one another:
* ->batch must never be higher then ->high.
* The following function updates them in a safe manner without read side
* locking.
*
* Any new users of pcp->batch and pcp->high should ensure they can cope with
* those fields changing asynchronously (acording to the above rule).
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
* exist).
*/
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
unsigned long batch)
{
/* start with a fail safe value for batch */
pcp->batch = 1;
smp_wmb();
/* Update high, then batch, in order */
pcp->high = high;
smp_wmb();
pcp->batch = batch;
}
/* a companion to pageset_set_high() */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
}
static void pageset_init(struct per_cpu_pageset *p)
{
struct per_cpu_pages *pcp;
int migratetype;
memset(p, 0, sizeof(*p));
pcp = &p->pcp;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
pageset_init(p);
pageset_set_batch(p, batch);
}
/*
* pageset_set_high() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/
static void pageset_set_high(struct per_cpu_pageset *p,
unsigned long high)
{
unsigned long batch = max(1UL, high / 4);
if ((high / 4) > (PAGE_SHIFT * 8))
batch = PAGE_SHIFT * 8;
pageset_update(&p->pcp, high, batch);
}
static void pageset_set_high_and_batch(struct zone *zone,
struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
(zone_managed_pages(zone) /
percpu_pagelist_fraction));
else
pageset_set_batch(pcp, zone_batchsize(zone));
}
static void __meminit zone_pageset_init(struct zone *zone, int cpu)
{
struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
pageset_init(pcp);
pageset_set_high_and_batch(zone, pcp);
}
void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu);
}
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
*/
void __init setup_per_cpu_pageset(void)
{
struct pglist_data *pgdat;
struct zone *zone;
int __maybe_unused cpu;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
#ifdef CONFIG_NUMA
/*
* Unpopulated zones continue using the boot pagesets.
* The numa stats for these pagesets need to be reset.
* Otherwise, they will end up skewing the stats of
* the nodes these zones are associated with.
*/
for_each_possible_cpu(cpu) {
struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
memset(pcp->vm_numa_stat_diff, 0,
sizeof(pcp->vm_numa_stat_diff));
}
#endif
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
}
static __meminit void zone_pcp_init(struct zone *zone)
{
/*
* per cpu subsystem is not up at this point. The following code
* relies on the ability of the linker to provide the
* offset of a (static) per cpu variable into the per cpu area.
*/
zone->pageset = &boot_pageset;
if (populated_zone(zone))
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
}
void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int zone_idx = zone_idx(zone) + 1;
if (zone_idx > pgdat->nr_zones)
pgdat->nr_zones = zone_idx;
zone->zone_start_pfn = zone_start_pfn;
mminit_dprintk(MMINIT_TRACE, "memmap_init",
"Initialising map node %d zone %lu pfns %lu -> %lu\n",
pgdat->node_id,
(unsigned long)zone_idx(zone),
zone_start_pfn, (zone_start_pfn + size));
zone_init_free_lists(zone);
zone->initialized = 1;
}
/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
* provided by memblock_set_node(). If called for a node
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
unsigned long this_start_pfn, this_end_pfn;
int i;
*start_pfn = -1UL;
*end_pfn = 0;
for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
*start_pfn = min(*start_pfn, this_start_pfn);
*end_pfn = max(*end_pfn, this_end_pfn);
}
if (*start_pfn == -1UL)
*start_pfn = 0;
}
/*
* This finds a zone that can be used for ZONE_MOVABLE pages. The
* assumption is made that zones within a node are ordered in monotonic
* increasing memory addresses so that the "highest" populated zone is used
*/
static void __init find_usable_zone_for_movable(void)
{
int zone_index;
for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
if (zone_index == ZONE_MOVABLE)
continue;
if (arch_zone_highest_possible_pfn[zone_index] >
arch_zone_lowest_possible_pfn[zone_index])
break;
}
VM_BUG_ON(zone_index == -1);
movable_zone = zone_index;
}
/*
* The zone ranges provided by the architecture do not include ZONE_MOVABLE
* because it is sized independent of architecture. Unlike the other zones,
* the starting point for ZONE_MOVABLE is not fixed. It may be different
* in each node depending on the size of each node and how evenly kernelcore
* is distributed. This helper function adjusts the zone ranges
* provided by the architecture for a given node by using the end of the
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
* zones within a node are in order of monotonic increases memory addresses
*/
static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
unsigned long *zone_end_pfn)
{
/* Only adjust if ZONE_MOVABLE is on this node */
if (zone_movable_pfn[nid]) {
/* Size ZONE_MOVABLE */
if (zone_type == ZONE_MOVABLE) {
*zone_start_pfn = zone_movable_pfn[nid];
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
/* Adjust for ZONE_MOVABLE starting within this range */
} else if (!mirrored_kernelcore &&
*zone_start_pfn < zone_movable_pfn[nid] &&
*zone_end_pfn > zone_movable_pfn[nid]) {
*zone_end_pfn = zone_movable_pfn[nid];
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
}
}
/*
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
unsigned long *zone_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
return *zone_end_pfn - *zone_start_pfn;
}
/*
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
unsigned long nr_absent = range_end_pfn - range_start_pfn;
unsigned long start_pfn, end_pfn;
int i;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
nr_absent -= end_pfn - start_pfn;
}
return nr_absent;
}
/**
* absent_pages_in_range - Return number of page frames in holes within a range
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
* Return: the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
{
return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
}
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
* ZONE_MOVABLE handling.
* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
* and vice versa.
*/
if (mirrored_kernelcore && zone_movable_pfn[nid]) {
unsigned long start_pfn, end_pfn;
struct memblock_region *r;
for_each_mem_region(r) {
start_pfn = clamp(memblock_region_memory_base_pfn(r),
zone_start_pfn, zone_end_pfn);
end_pfn = clamp(memblock_region_memory_end_pfn(r),
zone_start_pfn, zone_end_pfn);
if (zone_type == ZONE_MOVABLE &&
memblock_is_mirror(r))
nr_absent += end_pfn - start_pfn;
if (zone_type == ZONE_NORMAL &&
!memblock_is_mirror(r))
nr_absent += end_pfn - start_pfn;
}
}
return nr_absent;
}
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
{
unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long spanned, absent;
unsigned long size, real_size;
spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
&zone_start_pfn,
&zone_end_pfn);
absent = zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn);
size = spanned;
real_size = size - absent;
if (size)
zone->zone_start_pfn = zone_start_pfn;
else
zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
totalpages += size;
realtotalpages += real_size;
}
pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
}
#ifndef CONFIG_SPARSEMEM
/*
* Calculate the size of the zone->blockflags rounded to an unsigned long
* Start by making sure zonesize is a multiple of pageblock_order by rounding
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
* round what is now in bits to nearest long in bits, then return it in
* bytes.
*/
static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
unsigned long usemapsize;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
return usemapsize / 8;
}
static void __ref setup_usemap(struct pglist_data *pgdat,
struct zone *zone,
unsigned long zone_start_pfn,
unsigned long zonesize)
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize) {
zone->pageblock_flags =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
pgdat->node_id);
if (!zone->pageblock_flags)
panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
usemapsize, zone->name, pgdat->node_id);
}
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
unsigned long zone_start_pfn, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
unsigned int order;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
return;
if (HPAGE_SHIFT > PAGE_SHIFT)
order = HUGETLB_PAGE_ORDER;
else
order = MAX_ORDER - 1;
/*
* Assume the largest contiguous order of interest is a huge page.
* This value may be variable depending on boot parameters on IA64 and
* powerpc.
*/
pageblock_order = order;
}
#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
/*
* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
* is unused as pageblock_order is set at compile-time. See
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
void __init set_pageblock_order(void)
{
}
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
unsigned long present_pages)
{
unsigned long pages = spanned_pages;
/*
* Provide a more accurate estimation if there are holes within
* the zone and SPARSEMEM is in use. If there are holes within the
* zone, each populated memory region may cost us one or two extra
* memmap pages due to alignment because memmap pages for each
* populated regions may not be naturally aligned on page boundary.
* So the (present_pages >> 4) heuristic is a tradeoff for that.
*/
if (spanned_pages > present_pages + (present_pages >> 4) &&
IS_ENABLED(CONFIG_SPARSEMEM))
pages = present_pages;
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
spin_lock_init(&ds_queue->split_queue_lock);
INIT_LIST_HEAD(&ds_queue->split_queue);
ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif
#ifdef CONFIG_COMPACTION
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
init_waitqueue_head(&pgdat->kcompactd_wait);
}
#else
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
pgdat_resize_init(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
}
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
unsigned long remaining_pages)
{
atomic_long_set(&zone->managed_pages, remaining_pages);
zone_set_nid(zone, nid);
zone->name = zone_names[idx];
zone->zone_pgdat = NODE_DATA(nid);
spin_lock_init(&zone->lock);
zone_seqlock_init(zone);
zone_pcp_init(zone);
}
/*
* Set up the zone data structures
* - init pgdat internals
* - init all zones belonging to this node
*
* NOTE: this function is only called during memory hotplug
*/
#ifdef CONFIG_MEMORY_HOTPLUG
void __ref free_area_init_core_hotplug(int nid)
{
enum zone_type z;
pg_data_t *pgdat = NODE_DATA(nid);
pgdat_init_internals(pgdat);
for (z = 0; z < MAX_NR_ZONES; z++)
zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
}
#endif
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*
* NOTE: pgdat should get zeroed by caller.
* NOTE: this function is only called during early init.
*/
static void __init free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, freesize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone_init_internals(zone, j, nid, freesize);
if (!size)
continue;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
init_currently_empty_zone(zone, zone_start_pfn, size);
arch_memmap_init(size, nid, j, zone_start_pfn);
}
}
#ifdef CONFIG_FLAT_NODE_MEM_MAP
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size, end;
struct page *map;
/*
* The zone's endpoints aren't required to be MAX_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = memblock_alloc_node(size, SMP_CACHE_BYTES,
pgdat->node_id);
if (!map)
panic("Failed to allocate %ld bytes for node %d memory map\n",
size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= offset;
}
#endif
}
#else
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
pgdat->first_deferred_pfn = ULONG_MAX;
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
#endif
static void __init free_area_init_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pgdat->node_id = nid;
pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
}
void __init free_area_init_memoryless_node(int nid)
{
free_area_init_node(nid);
}
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
*/
void __init setup_nr_node_ids(void)
{
unsigned int highest;
highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
/**
* node_map_pfn_alignment - determine the maximum internode alignment
*
* This function should be called after node map is populated and sorted.
* It calculates the maximum power of two alignment which can distinguish
* all the nodes.
*
* For example, if all nodes are 1GiB and aligned to 1GiB, the return value
* would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
* nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
* shifted, 1GiB is enough and this function will indicate so.
*
* This is used to test whether pfn -> nid mapping of the chosen memory
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
* Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
if (!start || last_nid < 0 || last_nid == nid) {
last_nid = nid;
last_end = end;
continue;
}
/*
* Start with a mask granular enough to pin-point to the
* start pfn and tick off bits one-by-one until it becomes
* too coarse to separate the current node from the last.
*/
mask = ~((1 << __ffs(start)) - 1);
while (mask && last_end <= (start & (mask << 1)))
mask <<= 1;
/* accumulate all internode masks */
accl_mask |= mask;
}
/* convert mask to number of pages */
return ~accl_mask + 1;
}
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
* Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
return PHYS_PFN(memblock_start_of_DRAM());
}
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
* Populate N_MEMORY for calculating usable_nodes.
*/
static unsigned long __init early_calculate_totalpages(void)
{
unsigned long totalpages = 0;
unsigned long start_pfn, end_pfn;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
unsigned long pages = end_pfn - start_pfn;
totalpages += pages;
if (pages)
node_set_state(nid, N_MEMORY);
}
return totalpages;
}
/*
* Find the PFN the Movable zone begins in each node. Kernel memory
* is spread evenly between nodes as long as the nodes have enough
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
struct memblock_region *r;
/* Need to find movable_zone earlier when movable_node is specified. */
find_usable_zone_for_movable();
/*
* If movable_node is specified, ignore kernelcore and movablecore
* options.
*/
if (movable_node_is_enabled()) {
for_each_mem_region(r) {
if (!memblock_is_hotpluggable(r))
continue;
nid = memblock_get_region_node(r);
usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
}
goto out2;
}
/*
* If kernelcore=mirror is specified, ignore movablecore option
*/
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
nid = memblock_get_region_node(r);
usable_startpfn = memblock_region_memory_base_pfn(r);
if (usable_startpfn < 0x100000) {
mem_below_4gb_not_mirrored = true;
continue;
}
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
}
if (mem_below_4gb_not_mirrored)
pr_warn("This configuration results in unmirrored kernel memory.\n");
goto out2;
}
/*
* If kernelcore=nn% or movablecore=nn% was specified, calculate the
* amount of necessary memory.
*/
if (required_kernelcore_percent)
required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
10000UL;
if (required_movablecore_percent)
required_movablecore = (totalpages * 100 * required_movablecore_percent) /
10000UL;
/*
* If movablecore= was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
* will be used for required_kernelcore if it's greater than
* what movablecore would have allowed.
*/
if (required_movablecore) {
unsigned long corepages;
/*
* Round-up so that ZONE_MOVABLE is at least as large as what
* was requested by the user
*/
required_movablecore =
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
required_kernelcore = max(required_kernelcore, corepages);
}
/*
* If kernelcore was not specified or kernelcore size is larger
* than totalpages, there is no ZONE_MOVABLE.
*/
if (!required_kernelcore || required_kernelcore >= totalpages)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
/*
* Recalculate kernelcore_node if the division per node
* now exceeds what is necessary to satisfy the requested
* amount of memory for the kernel
*/
if (required_kernelcore < kernelcore_node)
kernelcore_node = required_kernelcore / usable_nodes;
/*
* As the map is walked, we track how much memory is usable
* by the kernel using kernelcore_remaining. When it is
* 0, the rest of the node is usable by ZONE_MOVABLE
*/
kernelcore_remaining = kernelcore_node;
/* Go through each range of PFNs within this node */
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
unsigned long size_pages;
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
if (start_pfn >= end_pfn)
continue;
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
kernel_pages = min(end_pfn, usable_startpfn)
- start_pfn;
kernelcore_remaining -= min(kernel_pages,
kernelcore_remaining);
required_kernelcore -= min(kernel_pages,
required_kernelcore);
/* Continue if range is now fully accounted */
if (end_pfn <= usable_startpfn) {
/*
* Push zone_movable_pfn to the end so
* that if we have to rebalance
* kernelcore across nodes, we will
* not double account here
*/
zone_movable_pfn[nid] = end_pfn;
continue;
}
start_pfn = usable_startpfn;
}
/*
* The usable PFN range for ZONE_MOVABLE is from
* start_pfn->end_pfn. Calculate size_pages as the
* number of pages used as kernelcore
*/
size_pages = end_pfn - start_pfn;
if (size_pages > kernelcore_remaining)
size_pages = kernelcore_remaining;
zone_movable_pfn[nid] = start_pfn + size_pages;
/*
* Some kernelcore has been met, update counts and
* break if the kernelcore for this node has been
* satisfied
*/
required_kernelcore -= min(required_kernelcore,
size_pages);
kernelcore_remaining -= size_pages;
if (!kernelcore_remaining)
break;
}
}
/*
* If there is still required_kernelcore, we do another pass with one
* less node in the count. This will push zone_movable_pfn[nid] further
* along on the nodes that still have memory until kernelcore is
* satisfied
*/
usable_nodes--;
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++) {
unsigned long start_pfn, end_pfn;
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
if (zone_movable_pfn[nid] >= end_pfn)
zone_movable_pfn[nid] = 0;
}
out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
}
/* Any regular or high memory on that node ? */
static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
if (IS_ENABLED(CONFIG_HIGHMEM))
node_set_state(nid, N_HIGH_MEMORY);
if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
}
}
/*
* Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
*/
bool __weak arch_has_descending_max_zone_pfns(void)
{
return false;
}
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid, zone;
bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
if (descending)
zone = MAX_NR_ZONES - i - 1;
else
zone = i;
if (zone == ZONE_MOVABLE)
continue;
end_pfn = max(max_zone_pfn[zone], start_pfn);
arch_zone_lowest_possible_pfn[zone] = start_pfn;
arch_zone_highest_possible_pfn[zone] = end_pfn;
start_pfn = end_pfn;
}
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
pr_info("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
pr_info(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
else
pr_cont("[mem %#018Lx-%#018Lx]\n",
(u64)arch_zone_lowest_possible_pfn[i]
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/*
* Print out the early node map, and initialize the
* subsection-map relative to active online memory ranges to
* enable future "sub-section" extensions of the memory map.
*/
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
subsection_map_init(start_pfn, end_pfn - start_pfn);
}
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
memmap_init();
}
static int __init cmdline_parse_core(char *p, unsigned long *core,
unsigned long *percent)
{
unsigned long long coremem;
char *endptr;
if (!p)
return -EINVAL;
/* Value may be a percentage of total memory, otherwise bytes */
coremem = simple_strtoull(p, &endptr, 0);
if (*endptr == '%') {
/* Paranoid check for percent values greater than 100 */
WARN_ON(coremem > 100);
*percent = coremem;
} else {
coremem = memparse(p, &p);
/* Paranoid check that UL is enough for the coremem value */
WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
*core = coremem >> PAGE_SHIFT;
*percent = 0UL;
}
return 0;
}
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
*/
static int __init cmdline_parse_kernelcore(char *p)
{
/* parse kernelcore=mirror */
if (parse_option_str(p, "mirror")) {
mirrored_kernelcore = true;
return 0;
}
return cmdline_parse_core(p, &required_kernelcore,
&required_kernelcore_percent);
}
/*
* movablecore=size sets the amount of memory for use for allocations that
* can be reclaimed or migrated.
*/
static int __init cmdline_parse_movablecore(char *p)
{
return cmdline_parse_core(p, &required_movablecore,
&required_movablecore_percent);
}
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
totalram_pages_add(count);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages_add(count);
#endif
}
EXPORT_SYMBOL(adjust_managed_page_count);
unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
void *pos;
unsigned long pages = 0;
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
struct page *page = virt_to_page(pos);
void *direct_map_addr;
/*
* 'direct_map_addr' might be different from 'pos'
* because some architectures' virt_to_page()
* work with aliases. Getting the direct map
* address ensures that we get a _writeable_
* alias for the memset().
*/
direct_map_addr = page_address(page);
/*
* Perform a kasan-unchecked memset() since this memory
* has not been initialized.
*/
direct_map_addr = kasan_reset_tag(direct_map_addr);
if ((unsigned int)poison <= 0xFF)
memset(direct_map_addr, poison, PAGE_SIZE);
free_reserved_page(page);
}
if (pages && s)
pr_info("Freeing %s memory: %ldK\n",
s, pages << (PAGE_SHIFT - 10));
return pages;
}
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
totalram_pages_inc();
atomic_long_inc(&page_zone(page)->managed_pages);
totalhigh_pages_inc();
}
#endif
void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
physpages = get_num_physpages();
codesize = _etext - _stext;
datasize = _edata - _sdata;
rosize = __end_rodata - __start_rodata;
bss_size = __bss_stop - __bss_start;
init_data_size = __init_end - __init_begin;
init_code_size = _einittext - _sinittext;
/*
* Detect special cases and adjust section sizes accordingly:
* 1) .init.* may be embedded into .data sections
* 2) .init.text.* may be out of [__init_begin, __init_end],
* please refer to arch/tile/kernel/vmlinux.lds.S.
* 3) .rodata.* may be embedded into .text or .data sections.
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
size -= adj; \
} while (0)
adj_init_size(__init_begin, __init_end, init_data_size,
_sinittext, init_code_size);
adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
#undef adj_init_size
pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
"%s%s)\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
totalhigh_pages() << (PAGE_SHIFT - 10),
#endif
str ? ", " : "", str ? str : "");
}
/**
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
*
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
* smaller per-cpu batchsize.
*/
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
dma_reserve = new_dma_reserve;
}
static int page_alloc_cpu_dead(unsigned int cpu)
{
lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
* Spill the event counters of the dead processor
* into the current processors event counters.
* This artificially elevates the count of the current
* processor.
*/
vm_events_fold_cpu(cpu);
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.
*
* This is only okay since the processor is dead and cannot
* race with what we are doing.
*/
cpu_vm_stats_fold(cpu);
return 0;
}
#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
static int __init set_hashdist(char *str)
{
if (!str)
return 0;
hashdist = simple_strtoul(str, &str, 0);
return 1;
}
__setup("hashdist=", set_hashdist);
#endif
void __init page_alloc_init(void)
{
int ret;
#ifdef CONFIG_NUMA
if (num_node_state(N_MEMORY) == 1)
hashdist = 0;
#endif
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
"mm/page_alloc:dead", NULL,
page_alloc_cpu_dead);
WARN_ON(ret < 0);
}
/*
* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
enum zone_type i, j;
for_each_online_pgdat(pgdat) {
pgdat->totalreserve_pages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
if (zone->lowmem_reserve[j] > max)
max = zone->lowmem_reserve[j];
}
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
if (max > managed_pages)
max = managed_pages;
pgdat->totalreserve_pages += max;
reserve_pages += max;
}
}
totalreserve_pages = reserve_pages;
}
/*
* setup_per_zone_lowmem_reserve - called whenever
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type i, j;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES - 1; i++) {
struct zone *zone = &pgdat->node_zones[i];
int ratio = sysctl_lowmem_reserve_ratio[i];
bool clear = !ratio || !zone_managed_pages(zone);
unsigned long managed_pages = 0;
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
managed_pages += zone_managed_pages(upper_zone);
if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
}
}
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone_managed_pages(zone);
}
for_each_zone(zone) {
u64 tmp, low;
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
low = (u64)pages_low * zone_managed_pages(zone);
do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->_watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
zone->_watermark[WMARK_MIN] = tmp;
}
/*
* Set the kswapd watermarks distance according to the
* scale factor in proportion to available memory, but
* ensure a minimum size on small systems.
*/
tmp = max_t(u64, tmp >> 2,
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
/**
* setup_per_zone_wmarks - called when min_free_kbytes changes
* or when memory is hot-{added|removed}
*
* Ensures that the watermark[min,low,high] values for each zone are set
* correctly with respect to min_free_kbytes.
*/
void setup_per_zone_wmarks(void)
{
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
__setup_per_zone_wmarks();
spin_unlock(&lock);
}
/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
* we want it large (256MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
*
* 16MB: 512k
* 32MB: 724k
* 64MB: 1024k
* 128MB: 1448k
* 256MB: 2048k
* 512MB: 2896k
* 1024MB: 4096k
* 2048MB: 5792k
* 4096MB: 8192k
* 8192MB: 11584k
* 16384MB: 16384k
*/
int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
int new_min_free_kbytes;
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (new_min_free_kbytes > user_min_free_kbytes) {
min_free_kbytes = new_min_free_kbytes;
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 262144)
min_free_kbytes = 262144;
} else {
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
}
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
#ifdef CONFIG_NUMA
setup_min_unmapped_ratio();
setup_min_slab_ratio();
#endif
khugepaged_min_free_kbytes_update();
return 0;
}
postcore_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
}
return 0;
}
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write)
setup_per_zone_wmarks();
return 0;
}
#ifdef CONFIG_NUMA
static void setup_min_unmapped_ratio(void)
{
pg_data_t *pgdat;
struct zone *zone;
for_each_online_pgdat(pgdat)
pgdat->min_unmapped_pages = 0;
for_each_zone(zone)
zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
sysctl_min_unmapped_ratio) / 100;
}
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
setup_min_unmapped_ratio();
return 0;
}
static void setup_min_slab_ratio(void)
{
pg_data_t *pgdat;
struct zone *zone;
for_each_online_pgdat(pgdat)
pgdat->min_slab_pages = 0;
for_each_zone(zone)
zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
sysctl_min_slab_ratio) / 100;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
setup_min_slab_ratio();
return 0;
}
#endif
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
* whenever sysctl_lowmem_reserve_ratio changes.
*
* The reserve ratio obviously has absolutely no relation with the
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int i;
proc_dointvec_minmax(table, write, buffer, length, ppos);
for (i = 0; i < MAX_NR_ZONES; i++) {
if (sysctl_lowmem_reserve_ratio[i] < 1)
sysctl_lowmem_reserve_ratio[i] = 0;
}
setup_per_zone_lowmem_reserve();
return 0;
}
static void __zone_pcp_update(struct zone *zone)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
pageset_set_high_and_batch(zone,
per_cpu_ptr(zone->pageset, cpu));
}
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_fraction;
int ret;
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_fraction = percpu_pagelist_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
if (percpu_pagelist_fraction &&
percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
percpu_pagelist_fraction = old_percpu_pagelist_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
for_each_populated_zone(zone)
__zone_pcp_update(zone);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
* is not known to alloc_large_system_hash().
*/
static unsigned long __init arch_reserved_kernel_pages(void)
{
return 0;
}
#endif
/*
* Adaptive scale is meant to reduce sizes of hash tables on large memory
* machines. As memory size is increased the scale is also increased but at
* slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
* quadruples the scale is increased by one, which means the size of hash table
* only doubles, instead of quadrupling as well.
* Because 32-bit systems cannot have large physical memory, where this scaling
* makes sense, it is disabled on such platforms.
*/
#if __BITS_PER_LONG > 32
#define ADAPT_SCALE_BASE (64ul << 30)
#define ADAPT_SCALE_SHIFT 2
#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
#endif
/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
* - limit is the number of hash buckets, not the total allocation size
*/
void *__init alloc_large_system_hash(const char *tablename,
unsigned long bucketsize,
unsigned long numentries,
int scale,
int flags,
unsigned int *_hash_shift,
unsigned int *_hash_mask,
unsigned long low_limit,
unsigned long high_limit)
{
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
gfp_t gfp_flags;
bool virt;
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
#if __BITS_PER_LONG > 32
if (!high_limit) {
unsigned long adapt;
for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
adapt <<= ADAPT_SCALE_SHIFT)
scale++;
}
#endif
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
/* Make sure we've got at least a 0-order allocation.. */
if (unlikely(flags & HASH_SMALL)) {
/* Makes no sense without HASH_EARLY */
WARN_ON(!(flags & HASH_EARLY));
if (!(numentries >> *_hash_shift)) {
numentries = 1UL << *_hash_shift;
BUG_ON(!numentries);
}
} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
/* limit allocation size to 1/16 total memory by default */
if (max == 0) {
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
do_div(max, bucketsize);
}
max = min(max, 0x80000000ULL);
if (numentries < low_limit)
numentries = low_limit;
if (numentries > max)
numentries = max;
log2qty = ilog2(numentries);
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
virt = false;
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
table = memblock_alloc(size, SMP_CACHE_BYTES);
else
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
} else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
*/
table = alloc_pages_exact(size, gfp_flags);
kmemleak_alloc(table, size, 1, gfp_flags);
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
virt ? "vmalloc" : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
if (_hash_mask)
*_hash_mask = (1 << log2qty) - 1;
return table;
}
/*
* This function checks whether pageblock includes unmovable pages or not.
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
* check without lock_page also may miss some movable non-lru pages at
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
struct page *has_unmovable_pages(struct zone *zone, struct page *page,
int migratetype, int flags)
{
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
unsigned long offset = pfn % pageblock_nr_pages;
if (is_migrate_cma_page(page)) {
/*
* CMA allocations (alloc_contig_range) really need to mark
* isolate CMA pageblocks even when they are not movable in fact
* so consider them movable here.
*/
if (is_migrate_cma(migratetype))
return NULL;
return page;
}
for (; iter < pageblock_nr_pages - offset; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(pfn + iter);
/*
* Both, bootmem allocations and memory holes are marked
* PG_reserved and are unmovable. We can even have unmovable
* allocations inside ZONE_MOVABLE, for example when
* specifying "movablecore".
*/
if (PageReserved(page))
return page;
/*
* If the zone is movable and we have ruled out all reserved
* pages then it should be reasonably safe to assume the rest
* is movable.
*/
if (zone_idx(zone) == ZONE_MOVABLE)
continue;
/*
* Hugepages are not in LRU lists, but they're movable.
* THPs are on the LRU, but need to be counted as #small pages.
* We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page) || PageTransCompound(page)) {
struct page *head = compound_head(page);
unsigned int skip_pages;
if (PageHuge(page)) {
if (!hugepage_migration_supported(page_hstate(head)))
return page;
} else if (!PageLRU(head) && !__PageMovable(head)) {
return page;
}
skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
continue;
}
/*
* We can't use page_count without pin a page
* because another CPU can free compound page.
* This check already skips compound tails of THP
* because their page->_refcount is zero at all time.
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << buddy_order(page)) - 1;
continue;
}
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
/*
* We treat all PageOffline() pages as movable when offlining
* to give drivers a chance to decrement their reference count
* in MEM_GOING_OFFLINE in order to indicate that these pages
* can be offlined as there are no direct references anymore.
* For actually unmovable PageOffline() where the driver does
* not support this, we will fail later when trying to actually
* move these pages that still have a reference count > 0.
* (false negatives in this function only)
*/
if ((flags & MEMORY_OFFLINE) && PageOffline(page))
continue;
if (__PageMovable(page) || PageLRU(page))
continue;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
return page;
}
return NULL;
}
#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
pageblock_nr_pages) - 1);
}
unsigned long pfn_max_align_up(unsigned long pfn)
{
return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
pageblock_nr_pages));
}
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
struct page *page;
unsigned long nr_skip = 0;
unsigned long nr_pages = 0;
dump_stack();
list_for_each_entry(page, page_list, lru) {
nr_pages++;
/* The page will be freed by putback_movable_pages soon */
if (page_count(page) == 1) {
nr_skip++;
continue;
}
dump_page(page, "migration failure");
}
pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
}
}
#else
static inline void alloc_contig_dump_pages(struct list_head *page_list)
{
}
#endif
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end,
struct acr_info *info)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
unsigned int max_tries = 5;
int ret = 0;
struct page *page;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
max_tries = 1;
lru_cache_disable();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
pfn = isolate_migratepages_range(cc, pfn, end);
if (!pfn) {
ret = -EINTR;
break;
}
tries = 0;
} else if (++tries == max_tries) {
ret = ret < 0 ? ret : -EBUSY;
break;
}
nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
&cc->migratepages);
info->nr_reclaimed += nr_reclaimed;
cc->nr_migratepages -= nr_reclaimed;
list_for_each_entry(page, &cc->migratepages, lru)
info->nr_mapped += page_mapcount(page);
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
if (!ret)
info->nr_migrated += cc->nr_migratepages;
}
lru_cache_enable();
if (ret < 0) {
if (ret == -EBUSY) {
alloc_contig_dump_pages(&cc->migratepages);
page_pinner_mark_migration_failed_pages(&cc->migratepages);
}
if (!list_empty(&cc->migratepages)) {
page = list_first_entry(&cc->migratepages, struct page , lru);
info->failed_pfn = page_to_pfn(page);
}
putback_movable_pages(&cc->migratepages);
info->err |= ACR_ERR_MIGRATE;
return ret;
}
return 0;
}
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
* @migratetype: migratetype of the underlaying pageblocks (either
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
* @gfp_mask: GFP mask to use during compaction
*
* The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
* aligned. The PFN range must belong to a single zone.
*
* The first thing this routine does is attempt to MIGRATE_ISOLATE all
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
* Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask,
struct acr_info *info)
{
unsigned long outer_start, outer_end;
unsigned int order;
int ret = 0;
bool skip_drain_all_pages = false;
struct compact_control cc = {
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
.mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
.gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
* work, we align the range to biggest of the two pages so
* that page allocator won't try to merge buddies from
* different pageblocks and change MIGRATE_ISOLATE to some
* other migration type.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
* we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
* allocator removing them from the buddy system. This way
* page allocator will never consider using them.
*
* This lets us mark the pageblocks back as
* MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
* aligned range but not in the unaligned, original range are
* put back to page allocator so that buddy can use them.
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0,
&info->failed_pfn);
if (ret) {
info->err |= ACR_ERR_ISOLATE;
return ret;
}
trace_android_vh_cma_drain_all_pages_bypass(migratetype,
&skip_drain_all_pages);
if (!skip_drain_all_pages)
drain_all_pages(cc.zone);
/*
* In case of -EBUSY, we'd like to know which page causes problem.
* So, just fall through. test_pages_isolated() has a tracepoint
* which will report the busy page.
*
* It is possible that busy pages could become available before
* the call to test_pages_isolated, and the range will actually be
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
ret = __alloc_contig_migrate_range(&cc, start, end, info);
if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
goto done;
ret =0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES
* aligned blocks that are marked as MIGRATE_ISOLATE. What's
* more, all pages in [start, end) are free in page allocator.
* What we are going to do is to allocate all pages from
* [start, end) (that is remove them from page allocator).
*
* The only problem is that pages at the beginning and at the
* end of interesting range may be not aligned with pages that
* page allocator holds, ie. they can be part of higher order
* pages. Because of this, we reserve the bigger range and
* once this is done free the pages we are not interested in.
*
* We don't have to hold zone->lock here because the pages are
* isolated thus they won't get removed from buddy.
*/
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
if (++order >= MAX_ORDER) {
outer_start = start;
break;
}
outer_start &= ~0UL << order;
}
if (outer_start != start) {
order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
* it doesn't include start page. Adjust outer_start
* in this case to report failed page properly
* on tracepoint in test_pages_isolated()
*/
if (outer_start + (1UL << order) <= start)
outer_start = start;
}
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
info->err |= ACR_ERR_TEST;
goto done;
}
/* Grab isolated pages from freelists. */
outer_end = isolate_freepages_range(&cc, outer_start, end);
if (!outer_end) {
ret = -EBUSY;
goto done;
}
/* Free head and tail (if any) */
if (start != outer_start)
free_contig_range(outer_start, start - outer_start);
if (end != outer_end)
free_contig_range(end, outer_end - end);
done:
undo_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range);
static int __alloc_contig_pages(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
struct acr_info dummy;
unsigned long end_pfn = start_pfn + nr_pages;
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
gfp_mask, &dummy);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long i, end_pfn = start_pfn + nr_pages;
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
page = pfn_to_online_page(i);
if (!page)
return false;
if (page_zone(page) != z)
return false;
if (PageReserved(page))
return false;
if (page_count(page) > 0)
return false;
if (PageHuge(page))
return false;
}
return true;
}
static bool zone_spans_last_pfn(const struct zone *zone,
unsigned long start_pfn, unsigned long nr_pages)
{
unsigned long last_pfn = start_pfn + nr_pages - 1;
return zone_spans_pfn(zone, last_pfn);
}
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
* @gfp_mask: GFP mask to limit search and used during compaction
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
* This routine is a wrapper around alloc_contig_range(). It scans over zones
* on an applicable zonelist to find a contiguous pfn range which can then be
* tried for allocation with alloc_contig_range(). This routine is intended
* for allocation requests which can not be fulfilled with the buddy allocator.
*
* The allocated memory is always aligned to a page boundary. If nr_pages is a
* power of two then the alignment is guaranteed to be to the given nr_pages
* (e.g. 1GB request would be aligned to 1GB).
*
* Allocated pages can be freed with free_contig_range() or by manually calling
* __free_page() on each allocated page.
*
* Return: pointer to contiguous pages on success, or NULL if not successful.
*/
struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
unsigned long ret, pfn, flags;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
zonelist = node_zonelist(nid, gfp_mask);
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(gfp_mask), nodemask) {
spin_lock_irqsave(&zone->lock, flags);
pfn = ALIGN(zone->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
* at some point. If there's an allocation
* spinning on this lock, it may win the race
* and cause alloc_contig_range() to fail...
*/
spin_unlock_irqrestore(&zone->lock, flags);
ret = __alloc_contig_pages(pfn, nr_pages,
gfp_mask);
if (!ret)
return pfn_to_page(pfn);
spin_lock_irqsave(&zone->lock, flags);
}
pfn += nr_pages;
}
spin_unlock_irqrestore(&zone->lock, flags);
}
return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
count += page_count(page) != 1;
__free_page(page);
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
*/
void __meminit zone_pcp_update(struct zone *zone)
{
mutex_lock(&pcp_batch_high_lock);
__zone_pcp_update(zone);
mutex_unlock(&pcp_batch_high_lock);
}
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;
int cpu;
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
drain_zonestat(zone, pset);
}
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be in a single zone, must not contain holes,
* must span full sections, and must be isolated before calling this function.
*/
void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
unsigned long flags;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
while (pfn < end_pfn) {
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
continue;
}
/*
* At this point all remaining PageOffline() pages have a
* reference count of 0 and can simply be skipped.
*/
if (PageOffline(page)) {
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
}
#endif
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
return order < MAX_ORDER;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
* Break down a higher-order page in sub-pages, and keep our target out of
* buddy allocator.
*/
static void break_down_buddy_pages(struct zone *zone, struct page *page,
struct page *target, int low, int high,
int migratetype)
{
unsigned long size = 1 << high;
struct page *current_buddy, *next_page;
while (high > low) {
high--;
size >>= 1;
if (target >= &page[size]) {
next_page = page + size;
current_buddy = page;
} else {
next_page = page;
current_buddy = page + size;
}
if (set_page_guard(zone, current_buddy, high, migratetype))
continue;
if (current_buddy != target) {
add_to_free_list(current_buddy, zone, high, migratetype);
set_buddy_order(current_buddy, high);
page = next_page;
}
}
}
/*
* Take a page that will be marked as poisoned off the buddy allocator.
*/
bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
if (PageBuddy(page_head) && page_order >= order) {
unsigned long pfn_head = page_to_pfn(page_head);
int migratetype = get_pfnblock_migratetype(page_head,
pfn_head);
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
if (page_count(page_head) > 0)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
#endif
#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
struct zone *zone = &pgdat->node_zones[ZONE_DMA];
if (managed_zone(zone))
return true;
}
return false;
}
#endif /* CONFIG_ZONE_DMA */