diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 0593c6e636c6..0d11f301fd72 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -509,6 +509,34 @@ alloc_failed:
 	return NULL;
 }
 
+bool match_callstack_filter(struct machine *machine, u64 *callstack)
+{
+	struct map *kmap;
+	struct symbol *sym;
+	u64 ip;
+
+	if (list_empty(&callstack_filters))
+		return true;
+
+	for (int i = 0; i < max_stack_depth; i++) {
+		struct callstack_filter *filter;
+
+		if (!callstack || !callstack[i])
+			break;
+
+		ip = callstack[i];
+		sym = machine__find_kernel_symbol(machine, ip, &kmap);
+		if (sym == NULL)
+			continue;
+
+		list_for_each_entry(filter, &callstack_filters, list) {
+			if (strstr(sym->name, filter->name))
+				return true;
+		}
+	}
+	return false;
+}
+
 struct trace_lock_handler {
 	/* it's used on CONFIG_LOCKDEP */
 	int (*acquire_event)(struct evsel *evsel,
@@ -1070,12 +1098,6 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
 		ls = lock_stat_findnew(key, name, flags);
 		if (!ls)
 			return -ENOMEM;
-
-		if (aggr_mode == LOCK_AGGR_CALLER && needs_callstack()) {
-			ls->callstack = get_callstack(sample, max_stack_depth);
-			if (ls->callstack == NULL)
-				return -ENOMEM;
-		}
 	}
 
 	if (filters.nr_types) {
@@ -1106,6 +1128,22 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
 			return 0;
 	}
 
+	if (needs_callstack()) {
+		u64 *callstack = get_callstack(sample, max_stack_depth);
+		if (callstack == NULL)
+			return -ENOMEM;
+
+		if (!match_callstack_filter(machine, callstack)) {
+			free(callstack);
+			return 0;
+		}
+
+		if (ls->callstack == NULL)
+			ls->callstack = callstack;
+		else
+			free(callstack);
+	}
+
 	ts = thread_stat_findnew(sample->tid);
 	if (!ts)
 		return -ENOMEM;
@@ -1606,31 +1644,6 @@ static void print_contention_result(struct lock_contention *con)
 		if (!st->wait_time_total)
 			continue;
 
-		if (aggr_mode == LOCK_AGGR_CALLER && !list_empty(&callstack_filters)) {
-			struct map *kmap;
-			struct symbol *sym;
-			u64 ip;
-
-			for (int i = 0; i < max_stack_depth; i++) {
-				struct callstack_filter *filter;
-
-				if (!st->callstack || !st->callstack[i])
-					break;
-
-				ip = st->callstack[i];
-				sym = machine__find_kernel_symbol(con->machine, ip, &kmap);
-				if (sym == NULL)
-					continue;
-
-				list_for_each_entry(filter, &callstack_filters, list) {
-					if (strstr(sym->name, filter->name))
-						goto found;
-				}
-			}
-			continue;
-		}
-
-found:
 		list_for_each_entry(key, &lock_keys, list) {
 			key->print(key, st);
 			pr_info(" ");
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index ead2898ba377..72cf81114982 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -34,13 +34,15 @@ int lock_contention_prepare(struct lock_contention *con)
 	bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
 	bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);
 
-	if (con->aggr_mode == LOCK_AGGR_TASK) {
+	if (con->aggr_mode == LOCK_AGGR_TASK)
 		bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
-		bpf_map__set_max_entries(skel->maps.stacks, 1);
-	} else {
+	else
 		bpf_map__set_max_entries(skel->maps.task_data, 1);
+
+	if (con->save_callstack)
 		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
-	}
+	else
+		bpf_map__set_max_entries(skel->maps.stacks, 1);
 
 	if (target__has_cpu(target))
 		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
@@ -146,6 +148,7 @@ int lock_contention_prepare(struct lock_contention *con)
 	/* these don't work well if in the rodata section */
 	skel->bss->stack_skip = con->stack_skip;
 	skel->bss->aggr_mode = con->aggr_mode;
+	skel->bss->needs_callstack = con->save_callstack;
 
 	lock_contention_bpf__attach(skel);
 	return 0;
@@ -177,7 +180,7 @@ static const char *lock_contention_get_name(struct lock_contention *con,
 
 	if (con->aggr_mode == LOCK_AGGR_TASK) {
 		struct contention_task_data task;
-		int pid = key->aggr_key;
+		int pid = key->pid;
 		int task_fd = bpf_map__fd(skel->maps.task_data);
 
 		/* do not update idle comm which contains CPU number */
@@ -194,7 +197,7 @@ static const char *lock_contention_get_name(struct lock_contention *con,
 	}
 
 	if (con->aggr_mode == LOCK_AGGR_ADDR) {
-		sym = machine__find_kernel_symbol(machine, key->aggr_key, &kmap);
+		sym = machine__find_kernel_symbol(machine, key->lock_addr, &kmap);
 		if (sym)
 			name = sym->name;
 		return name;
@@ -255,20 +258,35 @@ int lock_contention_read(struct lock_contention *con)
 
 	prev_key = NULL;
 	while (!bpf_map_get_next_key(fd, prev_key, &key)) {
-		s32 stack_id;
+		s64 ls_key;
 		const char *name;
 
 		/* to handle errors in the loop body */
 		err = -1;
 
 		bpf_map_lookup_elem(fd, &key, &data);
-
 		if (con->save_callstack) {
-			stack_id = key.aggr_key;
-			bpf_map_lookup_elem(stack, &stack_id, stack_trace);
+			bpf_map_lookup_elem(stack, &key.stack_id, stack_trace);
+
+			if (!match_callstack_filter(machine, stack_trace))
+				goto next;
 		}
 
-		st = lock_stat_find(key.aggr_key);
+		switch (con->aggr_mode) {
+		case LOCK_AGGR_CALLER:
+			ls_key = key.stack_id;
+			break;
+		case LOCK_AGGR_TASK:
+			ls_key = key.pid;
+			break;
+		case LOCK_AGGR_ADDR:
+			ls_key = key.lock_addr;
+			break;
+		default:
+			goto next;
+		}
+
+		st = lock_stat_find(ls_key);
 		if (st != NULL) {
 			st->wait_time_total += data.total_time;
 			if (st->wait_time_max < data.max_time)
@@ -283,7 +301,7 @@ int lock_contention_read(struct lock_contention *con)
 		}
 
 		name = lock_contention_get_name(con, &key, stack_trace);
-		st = lock_stat_findnew(key.aggr_key, name, data.flags);
+		st = lock_stat_findnew(ls_key, name, data.flags);
 		if (st == NULL)
 			break;
 
@@ -295,8 +313,6 @@ int lock_contention_read(struct lock_contention *con)
 		if (data.count)
 			st->avg_wait_time = data.total_time / data.count;
 
-		st->flags = data.flags;
-
 		if (con->save_callstack) {
 			st->callstack = memdup(stack_trace, stack_size);
 			if (st->callstack == NULL)
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index ad0ca5d50557..7ce276ed987e 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -82,6 +82,7 @@ int has_cpu;
 int has_task;
 int has_type;
 int has_addr;
+int needs_callstack;
 int stack_skip;
 
 /* determine the key of lock stat */
@@ -173,7 +174,7 @@ int contention_begin(u64 *ctx)
 	pelem->lock = (__u64)ctx[0];
 	pelem->flags = (__u32)ctx[1];
 
-	if (aggr_mode == LOCK_AGGR_CALLER) {
+	if (needs_callstack) {
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
@@ -188,7 +189,7 @@ int contention_end(u64 *ctx)
 {
 	__u32 pid;
 	struct tstamp_data *pelem;
-	struct contention_key key;
+	struct contention_key key = {};
 	struct contention_data *data;
 	__u64 duration;
 
@@ -204,14 +205,18 @@ int contention_end(u64 *ctx)
 
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
-		key.aggr_key = pelem->stack_id;
+		key.stack_id = pelem->stack_id;
 		break;
 	case LOCK_AGGR_TASK:
-		key.aggr_key = pid;
+		key.pid = pid;
 		update_task_data(pid);
+		if (needs_callstack)
+			key.stack_id = pelem->stack_id;
 		break;
 	case LOCK_AGGR_ADDR:
-		key.aggr_key = pelem->lock;
+		key.lock_addr = pelem->lock;
+		if (needs_callstack)
+			key.stack_id = pelem->stack_id;
 		break;
 	default:
 		/* should not happen */
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index ce71cf1a7e1e..3d35fd4407ac 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -4,7 +4,9 @@
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
 struct contention_key {
-	u64 aggr_key;  /* can be stack_id, pid or lock addr */
+	u32 stack_id;
+	u32 pid;
+	u64 lock_addr;
 };
 
 #define TASK_COMM_LEN  16
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index 39d5bfc77f4e..e5fc036108ec 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -68,6 +68,8 @@ struct lock_stat {
 struct lock_stat *lock_stat_find(u64 addr);
 struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags);
 
+bool match_callstack_filter(struct machine *machine, u64 *callstack);
+
 /*
  * struct lock_seq_stat:
  * Place to put on state of one lock sequence