[llvm] AMDGPU: Do not fold copy to physreg from operation on frame index (PR #115977)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 12 18:23:05 PST 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/115977
Fixes a verifier error after erasing copies that are still needed.
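For context, the failing pattern is a scalar add of a frame index whose result is copied into a physical argument register ahead of a call; erasing that copy during folding leaves $vgpr0 undefined at the call site and trips the machine verifier. A minimal sketch of the shape, condensed from the new MIR test in this patch (the waterfall-loop setup that defines the callee register %6 is omitted here):

  %1:sreg_32 = S_MOV_B32 %stack.0
  %2:sreg_32 = S_ADD_I32 killed %1, 4, implicit-def dead $scc
  $vgpr0 = COPY %2      ; folding must not erase this copy...
  $sgpr30_sgpr31 = SI_CALL %6, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0  ; ...the call reads $vgpr0

The SIFoldOperands change below moves the foldCopyToVGPROfScalarAddOfFrameIndex call after the existing bail-out for non-virtual destination registers, so the fold is no longer attempted when the copy's destination is a physical register.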
From 0d5d5c22d8add57bb3463f765098ad7793a7d9a2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 12 Nov 2024 18:21:15 -0800
Subject: [PATCH] AMDGPU: Do not fold copy to physreg from operation on frame
index
Fixes verifier error after erasing needed copies.
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 +-
.../CodeGen/AMDGPU/captured-frame-index.ll | 15 ++++
.../fold-operands-s-add-copy-to-vgpr.mir | 80 ++++++++++++++++---
3 files changed, 88 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 73834773f66e3c..4fb5cb066be7c3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1599,10 +1599,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;
- if (OpToFold.isReg() &&
- foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
- return true;
-
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
@@ -1612,6 +1608,10 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (!DstReg.isVirtual())
return false;
+ if (OpToFold.isReg() &&
+ foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
+ return true;
+
bool Changed = foldInstOperand(MI, OpToFold);
// If we managed to fold all uses of this copy then we might as well
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 2ec4c074a892dc..03f0609c817461 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -325,8 +325,23 @@ define amdgpu_kernel void @kernel_alloca_offset_use_asm_vgpr() {
ret void
}
+; GCN-LABEL: {{^}}live_out_physreg_copy_add_fi:
+; GCN: s_or_b32 [[FI:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: v_mov_b32_e32 v0, [[FI]]
+; GCN: v_mov_b32_e32 v1
+; GCN: s_swappc_b64
+define void @live_out_physreg_copy_add_fi(ptr %fptr) #2 {
+bb:
+ %alloca = alloca [4 x i32], align 16, addrspace(5)
+ %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
+ %getelementptr = getelementptr i8, ptr %addrspacecast, i64 4
+ call void %fptr(ptr %getelementptr) #2
+ ret void
+}
+
declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #1
declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
+attributes #2 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index 8c88c7a97174e2..4b4324b8a1611b 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -120,17 +120,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX8-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
- ; GFX8: $vgpr0 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def dead $vcc, implicit $exec
- ; GFX8-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX9-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
- ; GFX9: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX10-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
- ; GFX10: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def dead $scc
+ ; CHECK-NEXT: $vgpr0 = COPY [[S_ADD_I32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_ADD_I32 %0, 128, implicit-def dead $scc
$vgpr0 = COPY %1
@@ -535,3 +528,68 @@ body: |
%2:vgpr_32 = COPY %1
SI_RETURN implicit %2
...
+
+# Physreg copy of %2 to $vgpr0 should not be erased
+---
+name: fold_fi_into_s_or_b32_user_is_physreg_copy
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16, alignment: 16 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: fold_fi_into_s_or_b32_user_is_physreg_copy
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 4, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: $vgpr0 = COPY [[S_ADD_I32_]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[REG_SEQUENCE]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
+ ; CHECK-NEXT: SI_RETURN
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:sreg_32 = S_MOV_B32 %stack.0
+ %2:sreg_32 = S_ADD_I32 killed %1, 4, implicit-def dead $scc
+ %3:sreg_64_xexec = S_MOV_B64 $exec
+
+ bb.1:
+ %4:sgpr_32 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec
+ %5:sgpr_32 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec
+ %6:sgpr_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
+ %7:sreg_64_xexec = V_CMP_EQ_U64_e64 %6, %0, implicit $exec
+ %8:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed %7, implicit-def $exec, implicit-def $scc, implicit $exec
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ $vgpr0 = COPY %2
+ $sgpr30_sgpr31 = SI_CALL %6, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ $exec = S_XOR_B64_term $exec, %8, implicit-def $scc
+ SI_WATERFALL_LOOP %bb.1, implicit $exec
+
+ bb.2:
+ $exec = S_MOV_B64 %3
+ SI_RETURN
+
+...