android_kernel_xiaomi_sm8450/fs/erofs/zdata.h
Sandeep Dhavale 6f48588062 BACKPORT: erofs: add per-cpu threads for decompression as an option
Using a per-cpu thread pool, we can reduce scheduling latency compared
to the workqueue implementation. With this patch, both scheduling
latency and its variation are reduced, since the per-cpu threads are
high-priority kthread_workers.

The results were evaluated on arm64 Android devices running 5.10 kernel.

The table below shows the resulting improvements in total scheduling
latency for the same app-launch benchmark run over 50 iterations.
Scheduling latency is measured from when the task (workqueue kworker
vs. kthread_worker) became eligible to run to when it actually started
running.
+-------------------------+-----------+----------------+---------+
|                         | workqueue | kthread_worker |  diff   |
+-------------------------+-----------+----------------+---------+
| Average (us)            |     15253 |           2914 | -80.89% |
| Median (us)             |     14001 |           2912 | -79.20% |
| Minimum (us)            |      3117 |           1027 | -67.05% |
| Maximum (us)            |     30170 |           3805 | -87.39% |
| Standard deviation (us) |      7166 |            359 |         |
+-------------------------+-----------+----------------+---------+

Background: Boot times and cold app launch benchmarks are very
important to the Android ecosystem, as they directly translate to
responsiveness from the user's point of view. While EROFS provides
many important features, such as space savings, we saw a performance
penalty in cold app launch benchmarks in a few scenarios. Analysis
showed that the significant variance came from scheduling cost, while
decompression cost was more or less the same.

With the per-cpu thread pool, the table above shows that this
variation is reduced by ~80% on average. This problem was discussed
at LPC 2022; slides and a recording of the talk are available at [1].

[1] https://lpc.events/event/16/contributions/1338/

[ Gao Xiang: At least, we have to add this until the WQ_UNBOUND
             workqueue issue [2] on many arm64 devices is resolved. ]
[2] https://lore.kernel.org/r/CAJkfWY490-m6wNubkxiTPsW59sfsQs37Wey279LmiRxKt7aQYg@mail.gmail.com

Bug: 271636421
Bug: 278520205
Test: launch_cvd
Change-Id: I9dce2bfd6f40ec6a210161b80cee7c0417b4edb3
Signed-off-by: Sandeep Dhavale <dhavale@google.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230208093322.75816-1-hsiangkao@linux.alibaba.com
(cherry picked from commit 3fffb589b9a6e331e39cb75373ee7691acd7b109)
[dhavale: Fixed minor conflict as upstream now has zdata.h folded in
zdata.c]
Signed-off-by: Sandeep Dhavale <dhavale@google.com>
(cherry picked from commit 566a7f6c6b3f5f13b766fe749bbdb45918b029ac)
[dhavale: Fixed minor conflicts in Kconfig and zdata.c]
(cherry picked from commit 2de95f5d183c2174c9380a902919c8e59e380293)
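
For context, the per-CPU workers this patch introduces live in zdata.c,
not in the header below. A minimal sketch of the worker-creation path,
following the upstream commit this backports (3fffb589b9a6e331e39cb75373ee7691acd7b109);
the CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI option comes from that commit, and
the helper name here is illustrative:

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>

    /* Sketch: create one kthread_worker pinned to @cpu and, when the
     * high-priority option is enabled, move it to SCHED_FIFO (low prio). */
    static struct kthread_worker *erofs_init_percpu_worker(int cpu)
    {
            struct kthread_worker *worker =
                    kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);

            if (IS_ERR(worker))
                    return worker;
            if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
                    sched_set_fifo_low(worker->task);
            return worker;
    }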
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Created by Gao Xiang <gaoxiang25@huawei.com>
 */
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H

#include <linux/kthread.h>
#include "internal.h"
#include "zpvec.h"

#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_NR_INLINE_PAGEVECS	3
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by pageset lock;
 *
 * A: Field should be accessed / updated in atomic for parallelized code.
 */
struct z_erofs_collection {
	struct mutex lock;

	/* I: page offset of start position of decompression */
	unsigned short pageofs;

	/* L: maximum relative page index in pagevec[] */
	unsigned short nr_pages;

	/* L: total number of pages in pagevec[] */
	unsigned int vcnt;

	union {
		/* L: inline a certain number of pagevecs for bootstrap */
		erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];

		/* I: can be used to free the pcluster by RCU */
		struct rcu_head rcu;
	};
};
#define Z_EROFS_PCLUSTER_FULL_LENGTH	0x00000001
#define Z_EROFS_PCLUSTER_LENGTH_BIT	1

/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;

struct z_erofs_pcluster {
	struct erofs_workgroup obj;
	struct z_erofs_collection primary_collection;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* A: lower limit of decompressed length and if full length or not */
	unsigned int length;

	/* I: physical cluster size in pages */
	unsigned short pclusterpages;

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* A: compressed pages (can be cached or inplaced pages) */
	struct page *compressed_pages[];
};
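
/*
 * For illustration only (not in the original header): compressed_pages[]
 * is a flexible array member, so a pcluster carrying @pclusterpages
 * compressed pages is sized along the lines of
 *
 *	kvzalloc(struct_size(pcl, compressed_pages, pclusterpages), GFP_KERNEL);
 *
 * though zdata.c typically allocates pclusters from dedicated per-size pools.
 */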
#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)

/* let's avoid the valid 32-bit kernel addresses */

/* the chained workgroup hasn't submitted I/O (still open) */
#define Z_EROFS_PCLUSTER_TAIL		((void *)0x5F0ECAFE)
/* the chained workgroup has already submitted I/O */
#define Z_EROFS_PCLUSTER_TAIL_CLOSED	((void *)0x5F0EDEAD)

#define Z_EROFS_PCLUSTER_NIL		(NULL)
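
/*
 * For illustration, a chain of pclusters linked through ->next thus
 * looks like:
 *
 *	pcl0 -> pcl1 -> ... -> Z_EROFS_PCLUSTER_TAIL		(still open)
 *	pcl0 -> pcl1 -> ... -> Z_EROFS_PCLUSTER_TAIL_CLOSED	(I/O submitted)
 *
 * while Z_EROFS_PCLUSTER_NIL denotes the absence of a chain.
 */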
struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;

	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
};
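
/*
 * A sketch of how the union member above is chosen, following the zdata.c
 * logic added by this patch (z_erofs_pcpu_workers and z_erofs_workqueue
 * are defined there): once pending_bios drops to zero, decompression is
 * kicked off on the local per-CPU kthread_worker when one is available,
 * falling back to the shared workqueue otherwise:
 *
 *	worker = rcu_dereference(
 *			z_erofs_pcpu_workers[raw_smp_processor_id()]);
 *	if (worker)
 *		kthread_queue_work(worker, &io->u.kthread_work);
 *	else
 *		queue_work(z_erofs_workqueue, &io->u.work);
 *
 * 'done' is used instead for synchronous (caller-waits) decompression.
 */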
#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
					 struct page *page)
{
	return page->mapping == MNGD_MAPPING(sbi);
}

#define Z_EROFS_ONLINEPAGE_COUNT_BITS	2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK	((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT	(Z_EROFS_ONLINEPAGE_COUNT_BITS)
/*
 * waiters (a.k.a. ongoing_packs): the number of outstanding references
 * that must be dropped before the page can be unlocked;
 * sub-index: 0 for a partial page, >= 1 is the full-page sub-index.
 */
typedef atomic_t z_erofs_onlinepage_t;
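
/*
 * For illustration, with Z_EROFS_ONLINEPAGE_COUNT_BITS == 2 the packed
 * atomic value looks like:
 *
 *	 31                    2  1 0
 *	+-----------------------+-----+
 *	|       sub-index       |  n  |	n = waiter count (2 bits)
 *	+-----------------------+-----+
 */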
/* type punning */
union z_erofs_onlinepage_converter {
	z_erofs_onlinepage_t *o;
	unsigned long *v;
};

static inline unsigned int z_erofs_onlinepage_index(struct page *page)
{
	union z_erofs_onlinepage_converter u;

	DBG_BUGON(!PagePrivate(page));
	u.v = &page_private(page);

	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
}
static inline void z_erofs_onlinepage_init(struct page *page)
{
	/* the initial reference keeps the page from being unlocked in advance */
	union {
		z_erofs_onlinepage_t o;
		unsigned long v;
	} u = { .o = ATOMIC_INIT(1) };

	set_page_private(page, u.v);
	smp_wmb();
	SetPagePrivate(page);
}
static inline void z_erofs_onlinepage_fixup(struct page *page,
					    uintptr_t index, bool down)
{
	union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
	int orig, orig_index, val;

repeat:
	orig = atomic_read(u.o);
	orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
	if (orig_index) {
		/* the sub-index can only be set once */
		if (!index)
			return;

		DBG_BUGON(orig_index != index);
	}

	/* repack the sub-index with the (possibly bumped) waiter count */
	val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
		((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
	if (atomic_cmpxchg(u.o, orig, val) != orig)
		goto repeat;
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
	union z_erofs_onlinepage_converter u;
	unsigned int v;

	DBG_BUGON(!PagePrivate(page));
	u.v = &page_private(page);

	v = atomic_dec_return(u.o);
	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
		set_page_private(page, 0);
		ClearPagePrivate(page);
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
}
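
/*
 * A sketch of how zdata.c pairs these helpers over a page's lifetime
 * (@pgnr being the page's sub-index within the decompressed extent):
 *
 *	z_erofs_onlinepage_init(page);		    // start with one reference
 *	z_erofs_onlinepage_fixup(page, pgnr, true); // record sub-index, +1 ref
 *	...decompression I/O...
 *	z_erofs_onlinepage_endio(page);		    // -1 ref; unlocks at zero
 */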
#define Z_EROFS_VMAP_ONSTACK_PAGES	\
	min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
#define Z_EROFS_VMAP_GLOBAL_PAGES	2048

#endif