Revert "perf: Fix perf_event_read_local() time"

This reverts commit 91b04e83c7 which is
commit 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 upstream

It breaks the kernel ABI and is not needed for Android systems, so
revert it.

Bug: 161946584
Fixes: 91b04e83c7 ("perf: Fix perf_event_read_local() time")
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ic93864b39858bea47cc735e83c294fc4063b9dcd
Greg Kroah-Hartman, 2022-02-07 16:41:30 +01:00
commit e33a5b611c (parent 0b4470b56e)
2 changed files with 112 additions and 149 deletions
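
For orientation before the hunks below: the reverted upstream patch let an NMI-context
reader derive the current context time from a single published offset
(time = perf_clock() + timeoffset), while the code restored by this revert keeps a
per-event shadow_ctx_time recorded at sched-in and computes
time = shadow_ctx_time + perf_clock(). The following is a minimal userspace sketch of
that arithmetic only, not kernel code; perf_clock() is stood in by clock_gettime(),
and the struct and helper names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for the kernel's perf_clock(): any monotonic nanosecond clock. */
static uint64_t perf_clock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

struct ctx_time {
	uint64_t time;       /* accumulated context time                  */
	uint64_t timestamp;  /* clock value when 'time' was last advanced */
	uint64_t timeoffset; /* time - timestamp, published for readers   */
};

/* Reverted scheme: time' = time + (now - timestamp) is re-arranged into
 * time' = now + (time - timestamp), so a lockless reader needs one offset. */
static void update_context_time(struct ctx_time *ctx)
{
	uint64_t now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
	ctx->timeoffset = ctx->time - ctx->timestamp; /* unsigned wrap is intended */
}

static uint64_t event_time_via_offset(const struct ctx_time *ctx)
{
	return perf_clock() + ctx->timeoffset;
}

/* Restored scheme: at sched-in the event records
 * shadow_ctx_time = tstamp - ctx->timestamp; a reader then adds perf_clock(). */
static uint64_t event_time_via_shadow(uint64_t shadow_ctx_time)
{
	return shadow_ctx_time + perf_clock();
}

int main(void)
{
	struct ctx_time ctx = { .timestamp = perf_clock() };
	uint64_t shadow_ctx_time;

	update_context_time(&ctx);
	/* Pretend an event was scheduled in when ctx.time was current. */
	shadow_ctx_time = ctx.time - ctx.timestamp; /* wraps, like timeoffset */

	printf("offset scheme: %llu\n",
	       (unsigned long long)event_time_via_offset(&ctx));
	printf("shadow scheme: %llu\n",
	       (unsigned long long)event_time_via_shadow(shadow_ctx_time));
	return 0;
}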

File 1 of 2:

@@ -678,6 +678,18 @@ struct perf_event {
u64 total_time_running;
u64 tstamp;
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
* context time as it was when the event was last scheduled in,
* or when ctx_sched_in failed to schedule the event because we
* run out of PMC.
*
* ctx_time already accounts for ctx->timestamp. Therefore to
* compute ctx_time for a sample, simply add perf_clock().
*/
u64 shadow_ctx_time;
struct perf_event_attr attr;
u16 header_size;
u16 id_header_size;
@@ -822,7 +834,6 @@ struct perf_event_context {
*/
u64 time;
u64 timestamp;
u64 timeoffset;
/*
* These fields let us detect when two contexts have both
@@ -905,8 +916,6 @@ struct bpf_perf_event_data_kern {
struct perf_cgroup_info {
u64 time;
u64 timestamp;
u64 timeoffset;
int active;
};
struct perf_cgroup {

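As a rough summary of the net effect of the hunks above on the three structures
(reconstructed from the hunk context; the "_summary" types are illustrative
stand-ins, not the kernel definitions, and the surrounding fields are omitted):

#include <stdint.h>

typedef uint64_t u64;

struct perf_event_summary {
	/* ... */
	u64 shadow_ctx_time;   /* restored by this revert */
	/* ... */
};

struct perf_event_context_summary {
	/* ... */
	u64 time;
	u64 timestamp;
	/* u64 timeoffset;     removed by this revert */
	/* ... */
};

struct perf_cgroup_info_summary {
	u64 time;
	u64 timestamp;
	/* u64 timeoffset;     removed by this revert */
	/* int active;         removed by this revert */
};
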
File 2 of 2:

@@ -673,23 +673,6 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
/*
* UP store-release, load-acquire
*/
#define __store_release(ptr, val) \
do { \
barrier(); \
WRITE_ONCE(*(ptr), (val)); \
} while (0)
#define __load_acquire(ptr) \
({ \
__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
barrier(); \
___p; \
})
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -735,51 +718,34 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
struct perf_cgroup_info *t;
struct perf_cgroup_info *info;
u64 now;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
if (!__load_acquire(&t->active))
return t->time;
now += READ_ONCE(t->timeoffset);
return now;
}
now = perf_clock();
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
if (adv)
info->time += now - info->timestamp;
info = this_cpu_ptr(cgrp->info);
info->time += now - info->timestamp;
info->timestamp = now;
/*
* see update_context_time()
*/
WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
struct perf_cgroup_info *info;
if (cgrp) {
u64 now = perf_clock();
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
__update_cgrp_time(info, now, true);
if (final)
__store_release(&info->active, 0);
__update_cgrp_time(cgrp);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
struct perf_cgroup_info *info;
struct perf_cgroup *cgrp;
/*
@@ -793,10 +759,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
info = this_cpu_ptr(event->cgrp->info);
__update_cgrp_time(info, perf_clock(), true);
}
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
static inline void
@@ -820,8 +784,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
__update_cgrp_time(info, ctx->timestamp, false);
__store_release(&info->active, 1);
info->timestamp = ctx->timestamp;
}
}
@@ -1017,6 +980,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
return ret;
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
event->shadow_ctx_time = now - t->timestamp;
}
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -1094,8 +1065,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
bool final)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}
@@ -1127,12 +1097,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
return 0;
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
}
@@ -1554,59 +1524,22 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
static void __update_context_time(struct perf_event_context *ctx, bool adv)
static void update_context_time(struct perf_event_context *ctx)
{
u64 now = perf_clock();
if (adv)
ctx->time += now - ctx->timestamp;
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
/*
* The above: time' = time + (now - timestamp), can be re-arranged
* into: time` = now + (time - timestamp), which gives a single value
* offset to compute future time without locks on.
*
* See perf_event_time_now(), which can be used from NMI context where
* it's (obviously) not possible to acquire ctx->lock in order to read
* both the above values in a consistent manner.
*/
WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}
static void update_context_time(struct perf_event_context *ctx)
{
__update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
if (unlikely(!ctx))
return 0;
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
return ctx->time;
}
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
struct perf_event_context *ctx = event->ctx;
if (unlikely(!ctx))
return 0;
if (is_cgroup_event(event))
return perf_cgroup_event_time_now(event, now);
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
return ctx->time;
now += READ_ONCE(ctx->timeoffset);
return now;
return ctx ? ctx->time : 0;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2400,7 +2333,7 @@ __perf_remove_from_context(struct perf_event *event,
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, false);
update_cgrp_time_from_cpuctx(cpuctx);
}
event_sched_out(event, cpuctx, ctx);
@@ -2409,9 +2342,6 @@ __perf_remove_from_context(struct perf_event *event,
list_del_event(event, ctx);
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
ctx->rotate_necessary = 0;
if (ctx->task) {
@@ -2537,6 +2467,40 @@ void perf_event_disable_inatomic(struct perf_event *event)
irq_work_queue(&event->pending);
}
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
*
* We could get by without this by leveraging the
* fact that to get to this function, the caller
* has most likely already called update_context_time()
* and update_cgrp_time_xx() and thus both timestamp
* are identical (or very close). Given that tstamp is,
* already adjusted for cgroup, we could say that:
* tstamp - ctx->timestamp
* is equivalent to
* tstamp - cgrp->timestamp.
*
* Then, in perf_output_read(), the calculation would
* work with no changes because:
* - event is guaranteed scheduled in
* - no scheduled out in between
* - thus the timestamp would be the same
*
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
* within the time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
perf_cgroup_set_shadow_time(event, event->tstamp);
else
event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2577,6 +2541,8 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -3250,6 +3216,16 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
cpuctx->task_ctx = NULL;
}
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3263,22 +3239,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
}
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
cpuctx->task_ctx = NULL;
update_cgrp_time_from_cpuctx(cpuctx);
}
is_active ^= ctx->is_active; /* changed bits */
@@ -3715,19 +3676,13 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
/*
* Because the userpage is strictly per-event (there is no concept of context,
* so there cannot be a context indirection), every userpage must be updated
* when context time starts :-(
*
* IOW, we must not miss EVENT_TIME edges.
*/
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3811,23 +3766,13 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
int is_active = ctx->is_active;
u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
if (is_active ^ EVENT_TIME) {
/* start ctx time */
__update_context_time(ctx, false);
perf_cgroup_set_timestamp(task, ctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
}
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3838,6 +3783,13 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
if (is_active & EVENT_TIME) {
/* start ctx time */
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
}
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -4373,18 +4325,6 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
u64 *running)
{
u64 ctx_time;
*now = perf_clock();
ctx_time = perf_event_time_now(event, *now);
__perf_update_times(event, ctx_time, enabled, running);
}
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4444,9 +4384,10 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
u64 __enabled, __running, __now;;
u64 now = event->shadow_ctx_time + perf_clock();
u64 __enabled, __running;
calc_timer_values(event, &__now, &__enabled, &__running);
__perf_update_times(event, now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -5754,6 +5695,18 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
u64 *running)
{
u64 ctx_time;
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
__perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -6293,6 +6246,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
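
For readers tracing the perf_event_read_local() hunk above: the restored path
computes now = event->shadow_ctx_time + perf_clock() and hands it to
__perf_update_times() to derive the enabled and running times. Below is a rough
userspace model of that last step under the usual perf semantics (enabled time
advances while the event is enabled, running time only while it is actually on
the PMU); the names and the state enum are illustrative, not copies of the
kernel helpers.

#include <stdint.h>
#include <stdio.h>

enum ev_state { EV_OFF = -1, EV_INACTIVE = 0, EV_ACTIVE = 1 };

struct ev_times {
	enum ev_state state;         /* current event state                   */
	uint64_t tstamp;             /* context time of the last state change */
	uint64_t total_time_enabled; /* accumulated up to that state change   */
	uint64_t total_time_running;
};

/* Model of the __perf_update_times() step used above: extend the stored
 * totals by (now - tstamp) according to the current state. */
static void update_times(const struct ev_times *ev, uint64_t now,
			 uint64_t *enabled, uint64_t *running)
{
	uint64_t delta = now - ev->tstamp;

	*enabled = ev->total_time_enabled;
	if (ev->state >= EV_INACTIVE)
		*enabled += delta;

	*running = ev->total_time_running;
	if (ev->state == EV_ACTIVE)
		*running += delta;
}

int main(void)
{
	struct ev_times ev = { .state = EV_ACTIVE, .tstamp = 100,
			       .total_time_enabled = 100, .total_time_running = 80 };
	uint64_t enabled, running;

	update_times(&ev, 150, &enabled, &running); /* now = 150 */
	printf("enabled=%llu running=%llu\n",       /* prints 150 and 130 */
	       (unsigned long long)enabled, (unsigned long long)running);
	return 0;
}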