[llvm] [CHR] Skip regions containing convergent calls (PR #180882)
Yaxun Liu via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 18 07:49:10 PST 2026
================
@@ -0,0 +1,179 @@
+; Test that CHR does not transform regions containing convergent or
+; noduplicate calls, following the same guard as SimplifyCFG.
+;
+; CHR (Control Height Reduction) merges multiple biased branches into a
+; single speculative check, cloning the region into hot/cold paths. On GPU
+; targets, this merged branch may be divergent (per-thread), splitting the
+; wavefront: some threads take the hot path, others the cold path.
+;
+; A convergent call like ds_bpermute (a cross-lane operation on AMDGPU)
+; requires a specific set of threads to be active — when thread X reads
+; from thread Y via ds_bpermute, thread Y must be active and participating
+; in the same call. After CHR cloning, thread Y may have gone to the cold
+; path while thread X is on the hot path, so the hot-path ds_bpermute reads
+; a stale register value from thread Y instead of the intended value.
+;
+; Similarly, noduplicate calls must not be duplicated by definition.
+;
+; RUN: opt < %s -passes='require<profile-summary>,function(chr)' -S | FileCheck %s
+; REQUIRES: amdgpu-registered-target
+
+; Module-level target triple for the functions in this test.
+target triple = "amdgcn-amd-amdhsa"
+
+; Intrinsic whose result the tests below use as the per-thread index (%tid)
+; when addressing %input and %output.
+declare i32 @llvm.amdgcn.workitem.id.x()
+; Cross-lane intrinsic called by @test_chr_convergent. Attribute group #0 is
+; defined outside this excerpt -- presumably it carries `convergent`; confirm.
+declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
+
+; Two biased divergent branches where the first region contains a convergent
+; cross-lane operation (ds_bpermute). CHR must not clone this region.
+;
+; Original code (should be preserved as-is):
+;   if (val > 0)             // Biased true, per-thread condition
+;     result = bpermute()    // Cross-lane read: thread X reads from thread Y
+;   if (val < 100)           // Biased true, per-thread condition
+;     output[tid] = result
+;
+; Without this fix, CHR would transform to:
+;   if (val > 0 && val < 100) {   // Merged speculative branch (hot path)
+;     result = bpermute()         // BUG: thread Y may not be on hot path,
+;                                 // so thread X reads stale value from Y
+;     output[tid] = result
+;   } else {                      // Cold path (.nonchr clone)
+;     if (val > 0) result = bpermute()
+;     if (val < 100) output[tid] = result
+;   }
+;
+; The merged branch splits the wavefront differently than the original
+; branches, changing which threads are active at the bpermute call site.
+;
+; Kernel with two profile-biased, per-thread branches. The first guarded
+; block (bb1) performs the ds.bpermute call, so the expected output is the
+; input CFG unchanged: both original branches present and no cloned blocks.
+;
+; Note on the negative checks: FileCheck only scans for a NOT pattern between
+; the positive matches on either side of it. The first negative check thus
+; covers only the rest of the define line; the second one, bounded by the
+; final `ret void` match, guards the tail of the body (bb2/merge2) against
+; any `.nonchr` clone.
+define amdgpu_kernel void @test_chr_convergent(
+    ptr addrspace(1) %input,
+    ptr addrspace(1) %output) !prof !14 {
+; CHECK-LABEL: @test_chr_convergent(
+; CHECK-NOT: nonchr
+; CHECK: entry:
+; CHECK: %cond1 = icmp sgt i32 %val, 0
+; CHECK: br i1 %cond1, label %bb1, label %merge1
+; CHECK: bb1:
+; CHECK: %perm = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane_idx, i32 %val)
+; CHECK: merge1:
+; CHECK: %cond2 = icmp slt i32 %val, 100
+; CHECK: br i1 %cond2, label %bb2, label %merge2
+; CHECK-NOT: nonchr
+; CHECK: ret void
+entry:
+  ; Load this thread's input element; both branch conditions derive from it.
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, ptr addrspace(1) %input, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep_in, align 4
+  %cond1 = icmp sgt i32 %val, 0
+  br i1 %cond1, label %bb1, label %merge1, !prof !15
+
+bb1:
+  ; The call that must not be cloned onto separate hot/cold paths.
+  %lane_idx = shl i32 %tid, 2
+  %perm = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane_idx, i32 %val)
+  br label %merge1
+
+merge1:
+  ; %result is 0 when bb1 was skipped.
+  %result = phi i32 [ %perm, %bb1 ], [ 0, %entry ]
+  %cond2 = icmp slt i32 %val, 100
+  br i1 %cond2, label %bb2, label %merge2, !prof !15
+
+bb2:
+  %gep_out = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %tid
+  store i32 %result, ptr addrspace(1) %gep_out, align 4
+  br label %merge2
+
+merge2:
+  ret void
+}
+
+; Same pattern but with a noduplicate call instead of convergent.
+; CHR must also skip this region.
+; External callee invoked from bb1 of @test_chr_noduplicate. Attribute group
+; #1 is defined outside this excerpt -- presumably `noduplicate`; confirm.
+declare void @noduplicate_callee() #1
+
+; Kernel with the same two-branch shape as the convergent test, except that
+; the first guarded block (bb1) calls @noduplicate_callee. The expected
+; output keeps both original branches and contains no cloned blocks.
+;
+; Note on the negative checks: FileCheck only scans for a NOT pattern between
+; the positive matches on either side of it. The first negative check thus
+; covers only the rest of the define line; the second one, bounded by the
+; final `ret void` match, guards the tail of the body (bb2/merge2) against
+; any `.nonchr` clone.
+define amdgpu_kernel void @test_chr_noduplicate(
+    ptr addrspace(1) %input,
+    ptr addrspace(1) %output) !prof !14 {
+; CHECK-LABEL: @test_chr_noduplicate(
+; CHECK-NOT: nonchr
+; CHECK: entry:
+; CHECK: br i1 %cond1, label %bb1, label %merge1
+; CHECK: bb1:
+; CHECK: call void @noduplicate_callee()
+; CHECK: merge1:
+; CHECK: br i1 %cond2, label %bb2, label %merge2
+; CHECK-NOT: nonchr
+; CHECK: ret void
+entry:
+  ; Load this thread's input element; both branch conditions derive from it.
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, ptr addrspace(1) %input, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep_in, align 4
+  %cond1 = icmp sgt i32 %val, 0
+  br i1 %cond1, label %bb1, label %merge1, !prof !15
+
+bb1:
+  ; The call that must appear exactly once in the output.
+  call void @noduplicate_callee()
+  br label %merge1
+
+merge1:
+  %cond2 = icmp slt i32 %val, 100
+  br i1 %cond2, label %bb2, label %merge2, !prof !15
+
+bb2:
+  %gep_out = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %tid
+  store i32 %val, ptr addrspace(1) %gep_out, align 4
+  br label %merge2
+
+merge2:
+  ret void
+}
+
+; A case without convergent or noduplicate calls — CHR should still transform.
----------------
yxsamliu wrote:
removed
https://github.com/llvm/llvm-project/pull/180882
More information about the llvm-commits mailing list