[llvm] AMDGPU: Fix assert when multi operands to update after folding imm (PR #148205)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 08:34:45 PDT 2025
https://github.com/macurtis-amd updated https://github.com/llvm/llvm-project/pull/148205
>From 3943134ac014208765b5f7734a7dc2da0b3ce41c Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 11 Jul 2025 05:33:54 -0500
Subject: [PATCH 1/7] AMDGPU: Fix assert when multi operands to update after
folding imm
In the original motivating test case, FoldList had entries:
#0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
UseOpNo: 1
#1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc
UseOpNo: 2
After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and
entry #1.UseOpNo was no longer valid, resulting in an assertion failure.
This change defers constant folding until after all operands have been updated
so that UseOpNo values remain valid.
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 +++--
...bug-multi-operands-to-update-after-fold.ll | 58 +++++++++++++++++++
2 files changed, 68 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ed06c37507af..0f2a932f984b1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
+ SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
- if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
- Changed = true;
- }
+ if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
+ ConstantFoldCandidates.push_back(Fold.UseMI);
} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
}
}
+
+ for (MachineInstr *MI : ConstantFoldCandidates) {
+ if (tryConstantFoldOp(MI)) {
+ LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
+ Changed = true;
+ }
+ }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
new file mode 100644
index 0000000000000..a81fc6a25e43e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
+
+%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
+%struct.bar.0 = type { %struct.blam }
+%struct.blam = type { i32, i32, i32, i32 }
+
+@global = external addrspace(3) global %struct.bar
+
+define void @snork() {
+; CHECK-LABEL: snork:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, global@abs32@lo
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
+; CHECK-NEXT: s_cmp_lg_u32 0, 0
+; CHECK-NEXT: ds_write_b128 v4, v[0:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
+ %fmul = fmul ninf float %call, 0.000000e+00
+ %fptoui = fptoui float %fmul to i32
+ %zext = zext i32 %fptoui to i64
+ %mul = mul i64 2, %zext
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+ store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4
+ store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8
+ store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4
+ %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
+ %extractelement = extractelement <4 x i32> %load, i64 0
+ %icmp = icmp ne i32 %extractelement, 0
+ %extractelement1 = extractelement <4 x i32> %load, i64 3
+ %icmp2 = icmp ne i32 %extractelement1, 0
+ %select = select i1 %icmp, i1 true, i1 %icmp2
+ br i1 %select, label %bb5, label %bb3
+
+bb3: ; preds = %bb
+ %and = and <4 x i32> %load, splat (i32 1)
+ %extractelement4 = extractelement <4 x i32> %and, i64 0
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.rcp.f32(float) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
>From eabe0484618ba11ec8d1ad5ac6f0e290e71e291e Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 11 Jul 2025 12:28:36 -0500
Subject: [PATCH 2/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 +-
...bug-multi-operands-to-update-after-fold.ll | 58 --------
...ug-multi-operands-to-update-after-fold.mir | 128 ++++++++++++++++++
3 files changed, 131 insertions(+), 61 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0f2a932f984b1..e172c0b63189b 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,7 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
- SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
+ SetVector<MachineInstr *> ConstantFoldCandidates;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1784,8 +1784,8 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
- if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
- ConstantFoldCandidates.push_back(Fold.UseMI);
+ if (Fold.isImm())
+ ConstantFoldCandidates.insert(Fold.UseMI);
} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
deleted file mode 100644
index a81fc6a25e43e..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s
-
-%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
-%struct.bar.0 = type { %struct.blam }
-%struct.blam = type { i32, i32, i32, i32 }
-
-@global = external addrspace(3) global %struct.bar
-
-define void @snork() {
-; CHECK-LABEL: snork:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, global@abs32@lo
-; CHECK-NEXT: s_mov_b32 s5, s4
-; CHECK-NEXT: s_mov_b32 s6, s4
-; CHECK-NEXT: s_mov_b32 s7, s4
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: v_mov_b32_e32 v2, s6
-; CHECK-NEXT: v_mov_b32_e32 v3, s7
-; CHECK-NEXT: s_cmp_lg_u32 0, 0
-; CHECK-NEXT: ds_write_b128 v4, v[0:3] offset:32
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
-bb:
- %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
- %fmul = fmul ninf float %call, 0.000000e+00
- %fptoui = fptoui float %fmul to i32
- %zext = zext i32 %fptoui to i64
- %mul = mul i64 2, %zext
- %trunc = trunc i64 %mul to i32
- store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
- store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4
- store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8
- store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4
- %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16
- %extractelement = extractelement <4 x i32> %load, i64 0
- %icmp = icmp ne i32 %extractelement, 0
- %extractelement1 = extractelement <4 x i32> %load, i64 3
- %icmp2 = icmp ne i32 %extractelement1, 0
- %select = select i1 %icmp, i1 true, i1 %icmp2
- br i1 %select, label %bb5, label %bb3
-
-bb3: ; preds = %bb
- %and = and <4 x i32> %load, splat (i32 1)
- %extractelement4 = extractelement <4 x i32> %and, i64 0
- br label %bb5
-
-bb5: ; preds = %bb3, %bb
- ret void
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare float @llvm.amdgcn.rcp.f32(float) #0
-
-attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
new file mode 100644
index 0000000000000..da362bdacc90f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
+--- |
+ %struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
+ %struct.bar.0 = type { %struct.blam }
+ %struct.blam = type { i32, i32, i32, i32 }
+
+ @global = external addrspace(3) global %struct.bar
+
+ define void @snork() {
+ bb:
+ %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
+ %fmul = fmul ninf float %call, 0.000000e+00
+ %fptoui = fptoui float %fmul to i32
+ %zext = zext i32 %fptoui to i64
+ %mul = mul i64 2, %zext
+ %trunc = trunc i64 %mul to i32
+ %0 = insertelement <4 x i32> poison, i32 %trunc, i32 0
+ %1 = insertelement <4 x i32> %0, i32 0, i32 1
+ %2 = insertelement <4 x i32> %1, i32 0, i32 2
+ %3 = insertelement <4 x i32> %2, i32 %trunc, i32 3
+ store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
+ %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
+ %extractelement = extractelement <4 x i32> %load, i64 0
+ %icmp = icmp ne i32 %extractelement, 0
+ %extractelement1 = extractelement <4 x i32> %load, i64 3
+ %icmp2 = icmp ne i32 %extractelement1, 0
+ %select = select i1 %icmp, i1 true, i1 %icmp2
+ %select.inv = xor i1 %select, true
+ br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0
+
+ bb3: ; preds = %bb
+ %and = and <4 x i32> %load, splat (i32 1)
+ br label %bb5, !amdgpu.uniform !0
+
+ bb5: ; preds = %bb3, %bb
+ ret void
+ }
+
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare float @llvm.amdgcn.rcp.f32(float)
+
+ !0 = !{}
+...
+---
+name: snork
+alignment: 1
+tracksRegLiveness: true
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+registers:
+ - { id: 0, class: sgpr_128 }
+ - { id: 1, class: sgpr_64 }
+ - { id: 2, class: sgpr_64 }
+ - { id: 3, class: sgpr_64 }
+ - { id: 4, class: sgpr_64 }
+ - { id: 5, class: sgpr_32 }
+ - { id: 6, class: sgpr_32 }
+ - { id: 7, class: sgpr_32 }
+ - { id: 8, class: sgpr_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sgpr_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vreg_128 }
+ - { id: 13, class: sreg_32 }
+ - { id: 14, class: sreg_32 }
+ - { id: 15, class: sreg_32 }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ dispatchPtr: { reg: '$sgpr4_sgpr5' }
+ queuePtr: { reg: '$sgpr6_sgpr7' }
+ dispatchID: { reg: '$sgpr10_sgpr11' }
+ workGroupIDX: { reg: '$sgpr12' }
+ workGroupIDY: { reg: '$sgpr13' }
+ workGroupIDZ: { reg: '$sgpr14' }
+ LDSKernelId: { reg: '$sgpr15' }
+ implicitArgPtr: { reg: '$sgpr8_sgpr9' }
+ workItemIDX: { reg: '$vgpr31', mask: 1023 }
+ workItemIDY: { reg: '$vgpr31', mask: 1047552 }
+ workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+body: |
+ ; CHECK-LABEL: name: snork
+ ; CHECK: bb.0.bb:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
+ ; CHECK-NEXT: S_CMP_LG_U32 0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.bb3:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.bb5:
+ ; CHECK-NEXT: SI_RETURN
+ bb.0.bb:
+ successors: %bb.1, %bb.2
+
+ %9:sreg_32 = S_MOV_B32 0
+ %10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3
+ %11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
+ %12:vreg_128 = COPY %10
+ DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
+ %15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc
+ S_CMP_LG_U32 killed %15, 0, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1.bb3:
+
+ bb.2.bb5:
+ SI_RETURN
+...
>From c80176b842713f214610fb4b2cadba818f990c7f Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 11 Jul 2025 12:36:56 -0500
Subject: [PATCH 3/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
.../CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
index da362bdacc90f..95095c132f879 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -37,7 +37,6 @@
ret void
}
- ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.rcp.f32(float)
!0 = !{}
>From c0f7cce7c5ebc7e051e6985615a038654ea997fb Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 11 Jul 2025 15:51:48 -0500
Subject: [PATCH 4/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
...ug-multi-operands-to-update-after-fold.mir | 82 ++-----------------
1 file changed, 5 insertions(+), 77 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
index 95095c132f879..027b6fd0b014d 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -1,93 +1,21 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
--- |
- %struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
- %struct.bar.0 = type { %struct.blam }
- %struct.blam = type { i32, i32, i32, i32 }
-
- @global = external addrspace(3) global %struct.bar
+ @global = external addrspace(3) global i32
define void @snork() {
bb:
- %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
- %fmul = fmul ninf float %call, 0.000000e+00
- %fptoui = fptoui float %fmul to i32
- %zext = zext i32 %fptoui to i64
- %mul = mul i64 2, %zext
- %trunc = trunc i64 %mul to i32
- %0 = insertelement <4 x i32> poison, i32 %trunc, i32 0
- %1 = insertelement <4 x i32> %0, i32 0, i32 1
- %2 = insertelement <4 x i32> %1, i32 0, i32 2
- %3 = insertelement <4 x i32> %2, i32 %trunc, i32 3
- store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
- %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
- %extractelement = extractelement <4 x i32> %load, i64 0
- %icmp = icmp ne i32 %extractelement, 0
- %extractelement1 = extractelement <4 x i32> %load, i64 3
- %icmp2 = icmp ne i32 %extractelement1, 0
- %select = select i1 %icmp, i1 true, i1 %icmp2
- %select.inv = xor i1 %select, true
- br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0
+ br label %bb3
- bb3: ; preds = %bb
- %and = and <4 x i32> %load, splat (i32 1)
- br label %bb5, !amdgpu.uniform !0
+ bb3:
+ br label %bb5
- bb5: ; preds = %bb3, %bb
+ bb5:
ret void
}
-
- declare float @llvm.amdgcn.rcp.f32(float)
-
- !0 = !{}
...
---
name: snork
-alignment: 1
-tracksRegLiveness: true
-noPhis: false
-isSSA: true
-noVRegs: false
-hasFakeUses: false
-registers:
- - { id: 0, class: sgpr_128 }
- - { id: 1, class: sgpr_64 }
- - { id: 2, class: sgpr_64 }
- - { id: 3, class: sgpr_64 }
- - { id: 4, class: sgpr_64 }
- - { id: 5, class: sgpr_32 }
- - { id: 6, class: sgpr_32 }
- - { id: 7, class: sgpr_32 }
- - { id: 8, class: sgpr_32 }
- - { id: 9, class: sreg_32 }
- - { id: 10, class: sgpr_128 }
- - { id: 11, class: vgpr_32 }
- - { id: 12, class: vreg_128 }
- - { id: 13, class: sreg_32 }
- - { id: 14, class: sreg_32 }
- - { id: 15, class: sreg_32 }
-frameInfo:
- maxAlignment: 1
-machineFunctionInfo:
- maxKernArgAlign: 1
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- frameOffsetReg: '$sgpr33'
- stackPtrOffsetReg: '$sgpr32'
- argumentInfo:
- privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
- dispatchPtr: { reg: '$sgpr4_sgpr5' }
- queuePtr: { reg: '$sgpr6_sgpr7' }
- dispatchID: { reg: '$sgpr10_sgpr11' }
- workGroupIDX: { reg: '$sgpr12' }
- workGroupIDY: { reg: '$sgpr13' }
- workGroupIDZ: { reg: '$sgpr14' }
- LDSKernelId: { reg: '$sgpr15' }
- implicitArgPtr: { reg: '$sgpr8_sgpr9' }
- workItemIDX: { reg: '$vgpr31', mask: 1023 }
- workItemIDY: { reg: '$vgpr31', mask: 1047552 }
- workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
- occupancy: 16
- sgprForEXECCopy: '$sgpr105'
body: |
; CHECK-LABEL: name: snork
; CHECK: bb.0.bb:
>From 8db18fb4c34f6b289ef0b384f5b712b0caf3af53 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Mon, 14 Jul 2025 08:01:43 -0500
Subject: [PATCH 5/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
...ug-multi-operands-to-update-after-fold.mir | 55 ++++---------------
1 file changed, 10 insertions(+), 45 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
index 027b6fd0b014d..ec4ed94e25b79 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -1,55 +1,20 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
--- |
- @global = external addrspace(3) global i32
-
define void @snork() {
- bb:
- br label %bb3
-
- bb3:
- br label %bb5
-
- bb5:
ret void
}
...
---
-name: snork
-body: |
- ; CHECK-LABEL: name: snork
- ; CHECK: bb.0.bb:
- ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
- ; CHECK-NEXT: S_CMP_LG_U32 0, 0, implicit-def $scc
- ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
- ; CHECK-NEXT: S_BRANCH %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1.bb3:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2.bb5:
- ; CHECK-NEXT: SI_RETURN
- bb.0.bb:
- successors: %bb.1, %bb.2
-
- %9:sreg_32 = S_MOV_B32 0
- %10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3
- %11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
- %12:vreg_128 = COPY %10
- DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
- %15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc
- S_CMP_LG_U32 killed %15, 0, implicit-def $scc
- S_CBRANCH_SCC1 %bb.2, implicit $scc
- S_BRANCH %bb.1
-
- bb.1.bb3:
-
- bb.2.bb5:
+name: snork
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: snork
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3
+ ; CHECK-NEXT: SI_RETURN
+ %0:sreg_32 = S_MOV_B32 0
+ %1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3
+ %2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc
SI_RETURN
...
>From 9be3cda1ac61f15e3250d0427981e88bc4b9759d Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Tue, 15 Jul 2025 03:57:06 -0500
Subject: [PATCH 6/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
.../AMDGPU/bug-multi-operands-to-update-after-fold.mir | 5 -----
1 file changed, 5 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
index ec4ed94e25b79..66c13b3f969a6 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -1,10 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
---- |
- define void @snork() {
- ret void
- }
-...
---
name: snork
body: |
>From 5bd778bb6d4469cf5e460aa43ce2f5b52d1edc1a Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Tue, 15 Jul 2025 10:34:04 -0500
Subject: [PATCH 7/7] fixup! AMDGPU: Fix assert when multi operands to update
after folding imm
---
.../AMDGPU/bug-multi-operands-to-update-after-fold.mir | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
index 66c13b3f969a6..d0c9740c6954e 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -6,10 +6,10 @@ body: |
bb.0:
; CHECK-LABEL: name: snork
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
; CHECK-NEXT: SI_RETURN
%0:sreg_32 = S_MOV_B32 0
- %1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3
- %2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc
+ %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
+ %2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc
SI_RETURN
...
More information about the llvm-commits
mailing list