Revert "perf: Fix perf_event_read_local() time"

This reverts commit 91b04e83c7 which is
commit 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 upstream

It breaks the kernel ABI and is not needed for Android systems, so
revert it.

Bug: 161946584
Fixes: 91b04e83c7 ("perf: Fix perf_event_read_local() time")
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ic93864b39858bea47cc735e83c294fc4063b9dcd
Greg Kroah-Hartman, 2022-02-07 16:41:30 +01:00
commit e33a5b611c (parent 0b4470b56e)
2 changed files with 112 additions and 149 deletions
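
For orientation before the hunks below: the reverted upstream patch let an NMI-context
reader derive the current context time from a single published offset
(time = perf_clock() + timeoffset), while the code restored by this revert keeps a
per-event shadow_ctx_time recorded at sched-in and computes
time = shadow_ctx_time + perf_clock(). The following is a minimal userspace sketch of
that arithmetic only, not kernel code; perf_clock() is stood in by clock_gettime(),
and the struct and helper names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for the kernel's perf_clock(): any monotonic nanosecond clock. */
static uint64_t perf_clock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

struct ctx_time {
	uint64_t time;       /* accumulated context time                  */
	uint64_t timestamp;  /* clock value when 'time' was last advanced */
	uint64_t timeoffset; /* time - timestamp, published for readers   */
};

/* Reverted scheme: time' = time + (now - timestamp) is re-arranged into
 * time' = now + (time - timestamp), so a lockless reader needs one offset. */
static void update_context_time(struct ctx_time *ctx)
{
	uint64_t now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
	ctx->timeoffset = ctx->time - ctx->timestamp; /* unsigned wrap is intended */
}

static uint64_t event_time_via_offset(const struct ctx_time *ctx)
{
	return perf_clock() + ctx->timeoffset;
}

/* Restored scheme: at sched-in the event records
 * shadow_ctx_time = tstamp - ctx->timestamp; a reader then adds perf_clock(). */
static uint64_t event_time_via_shadow(uint64_t shadow_ctx_time)
{
	return shadow_ctx_time + perf_clock();
}

int main(void)
{
	struct ctx_time ctx = { .timestamp = perf_clock() };
	uint64_t shadow_ctx_time;

	update_context_time(&ctx);
	/* Pretend an event was scheduled in when ctx.time was current. */
	shadow_ctx_time = ctx.time - ctx.timestamp; /* wraps, like timeoffset */

	printf("offset scheme: %llu\n",
	       (unsigned long long)event_time_via_offset(&ctx));
	printf("shadow scheme: %llu\n",
	       (unsigned long long)event_time_via_shadow(shadow_ctx_time));
	return 0;
}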

File 1 of 2:

@@ -678,6 +678,18 @@ struct perf_event {
u64 total_time_running;
u64 tstamp;
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
* context time as it was when the event was last scheduled in,
* or when ctx_sched_in failed to schedule the event because we
* run out of PMC.
*
* ctx_time already accounts for ctx->timestamp. Therefore to
* compute ctx_time for a sample, simply add perf_clock().
*/
u64 shadow_ctx_time;
struct perf_event_attr attr;
u16 header_size;
u16 id_header_size;
@@ -822,7 +834,6 @@ struct perf_event_context {
*/
u64 time;
u64 timestamp;
u64 timeoffset;
/*
* These fields let us detect when two contexts have both
@@ -905,8 +916,6 @@ struct bpf_perf_event_data_kern {
struct perf_cgroup_info {
u64 time;
u64 timestamp;
u64 timeoffset;
int active;
};
struct perf_cgroup {

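As a rough summary of the net effect of the hunks above on the three structures
(reconstructed from the hunk context; the "_summary" types are illustrative
stand-ins, not the kernel definitions, and the surrounding fields are omitted):

#include <stdint.h>

typedef uint64_t u64;

struct perf_event_summary {
	/* ... */
	u64 shadow_ctx_time;   /* restored by this revert */
	/* ... */
};

struct perf_event_context_summary {
	/* ... */
	u64 time;
	u64 timestamp;
	/* u64 timeoffset;     removed by this revert */
	/* ... */
};

struct perf_cgroup_info_summary {
	u64 time;
	u64 timestamp;
	/* u64 timeoffset;     removed by this revert */
	/* int active;         removed by this revert */
};
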
File 2 of 2:

@@ -673,23 +673,6 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
/*
* UP store-release, load-acquire
*/
#define __store_release(ptr, val) \
do { \
barrier(); \
WRITE_ONCE(*(ptr), (val)); \
} while (0)
#define __load_acquire(ptr) \
({ \
__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
barrier(); \
___p; \
})
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -735,51 +718,34 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
struct perf_cgroup_info *t;
struct perf_cgroup_info *info;
u64 now;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
if (!__load_acquire(&t->active))
return t->time;
now += READ_ONCE(t->timeoffset);
return now;
}
now = perf_clock();
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
if (adv)
info->time += now - info->timestamp;
info = this_cpu_ptr(cgrp->info);
info->time += now - info->timestamp;
info->timestamp = now;
/*
* see update_context_time()
*/
WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
struct perf_cgroup_info *info;
if (cgrp) {
u64 now = perf_clock();
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
__update_cgrp_time(info, now, true);
if (final)
__store_release(&info->active, 0);
__update_cgrp_time(cgrp);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
struct perf_cgroup_info *info;
struct perf_cgroup *cgrp;
/*
@@ -793,10 +759,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
info = this_cpu_ptr(event->cgrp->info);
__update_cgrp_time(info, perf_clock(), true);
}
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
static inline void
@@ -820,8 +784,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
__update_cgrp_time(info, ctx->timestamp, false);
__store_release(&info->active, 1);
info->timestamp = ctx->timestamp;
}
}
@@ -1017,6 +980,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
return ret;
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
event->shadow_ctx_time = now - t->timestamp;
}
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -1094,8 +1065,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
bool final)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}
@@ -1127,12 +1097,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
return 0;
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
}
@@ -1554,59 +1524,22 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
static void __update_context_time(struct perf_event_context *ctx, bool adv)
static void update_context_time(struct perf_event_context *ctx)
{
u64 now = perf_clock();
if (adv)
ctx->time += now - ctx->timestamp;
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
/*
* The above: time' = time + (now - timestamp), can be re-arranged
* into: time` = now + (time - timestamp), which gives a single value
* offset to compute future time without locks on.
*
* See perf_event_time_now(), which can be used from NMI context where
* it's (obviously) not possible to acquire ctx->lock in order to read
* both the above values in a consistent manner.
*/
WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}
static void update_context_time(struct perf_event_context *ctx)
{
__update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
if (unlikely(!ctx))
return 0;
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
return ctx->time;
}
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
struct perf_event_context *ctx = event->ctx;
if (unlikely(!ctx))
return 0;
if (is_cgroup_event(event))
return perf_cgroup_event_time_now(event, now);
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
return ctx->time;
now += READ_ONCE(ctx->timeoffset);
return now;
return ctx ? ctx->time : 0;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2400,7 +2333,7 @@ __perf_remove_from_context(struct perf_event *event,
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, false);
update_cgrp_time_from_cpuctx(cpuctx);
}
event_sched_out(event, cpuctx, ctx);
@@ -2409,9 +2342,6 @@ __perf_remove_from_context(struct perf_event *event,
list_del_event(event, ctx);
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
ctx->rotate_necessary = 0;
if (ctx->task) {
@@ -2537,6 +2467,40 @@ void perf_event_disable_inatomic(struct perf_event *event)
irq_work_queue(&event->pending);
}
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
*
* We could get by without this by leveraging the
* fact that to get to this function, the caller
* has most likely already called update_context_time()
* and update_cgrp_time_xx() and thus both timestamp
* are identical (or very close). Given that tstamp is,
* already adjusted for cgroup, we could say that:
* tstamp - ctx->timestamp
* is equivalent to
* tstamp - cgrp->timestamp.
*
* Then, in perf_output_read(), the calculation would
* work with no changes because:
* - event is guaranteed scheduled in
* - no scheduled out in between
* - thus the timestamp would be the same
*
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
* within the time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
perf_cgroup_set_shadow_time(event, event->tstamp);
else
event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2577,6 +2541,8 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -3250,6 +3216,16 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
cpuctx->task_ctx = NULL;
}
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3263,22 +3239,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
}
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
cpuctx->task_ctx = NULL;
update_cgrp_time_from_cpuctx(cpuctx);
}
is_active ^= ctx->is_active; /* changed bits */
@@ -3715,19 +3676,13 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
/*
* Because the userpage is strictly per-event (there is no concept of context,
* so there cannot be a context indirection), every userpage must be updated
* when context time starts :-(
*
* IOW, we must not miss EVENT_TIME edges.
*/
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3811,23 +3766,13 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
int is_active = ctx->is_active;
u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
if (is_active ^ EVENT_TIME) {
/* start ctx time */
__update_context_time(ctx, false);
perf_cgroup_set_timestamp(task, ctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
}
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3838,6 +3783,13 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
if (is_active & EVENT_TIME) {
/* start ctx time */
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
}
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -4373,18 +4325,6 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
u64 *running)
{
u64 ctx_time;
*now = perf_clock();
ctx_time = perf_event_time_now(event, *now);
__perf_update_times(event, ctx_time, enabled, running);
}
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4444,9 +4384,10 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
u64 __enabled, __running, __now;;
u64 now = event->shadow_ctx_time + perf_clock();
u64 __enabled, __running;
calc_timer_values(event, &__now, &__enabled, &__running);
__perf_update_times(event, now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -5754,6 +5695,18 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
u64 *running)
{
u64 ctx_time;
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
__perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -6293,6 +6246,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
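
For readers tracing the perf_event_read_local() hunk above: the restored path
computes now = event->shadow_ctx_time + perf_clock() and hands it to
__perf_update_times() to derive the enabled and running times. Below is a rough
userspace model of that last step under the usual perf semantics (enabled time
advances while the event is enabled, running time only while it is actually on
the PMU); the names and the state enum are illustrative, not copies of the
kernel helpers.

#include <stdint.h>
#include <stdio.h>

enum ev_state { EV_OFF = -1, EV_INACTIVE = 0, EV_ACTIVE = 1 };

struct ev_times {
	enum ev_state state;         /* current event state                   */
	uint64_t tstamp;             /* context time of the last state change */
	uint64_t total_time_enabled; /* accumulated up to that state change   */
	uint64_t total_time_running;
};

/* Model of the __perf_update_times() step used above: extend the stored
 * totals by (now - tstamp) according to the current state. */
static void update_times(const struct ev_times *ev, uint64_t now,
			 uint64_t *enabled, uint64_t *running)
{
	uint64_t delta = now - ev->tstamp;

	*enabled = ev->total_time_enabled;
	if (ev->state >= EV_INACTIVE)
		*enabled += delta;

	*running = ev->total_time_running;
	if (ev->state == EV_ACTIVE)
		*running += delta;
}

int main(void)
{
	struct ev_times ev = { .state = EV_ACTIVE, .tstamp = 100,
			       .total_time_enabled = 100, .total_time_running = 80 };
	uint64_t enabled, running;

	update_times(&ev, 150, &enabled, &running); /* now = 150 */
	printf("enabled=%llu running=%llu\n",       /* prints 150 and 130 */
	       (unsigned long long)enabled, (unsigned long long)running);
	return 0;
}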