[llvm] [AMDGPU] Reset kill flags for multiple uses of SDWAInst Ops (PR #97135)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 30 10:08:53 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/97135
>From 9620b78eff2038cdb91a04ec6274d6413331341f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 28 Jun 2024 19:03:46 -0700
Subject: [PATCH 1/2] [AMDGPU] Reset kill flags for multiple uses of SDWAInst Ops
Change-Id: I8b56d86a55c397623567945a87ad2f55749680bc
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 +
.../AMDGPU/sdwa-peephole-multiuse-kill.ll | 194 ++++++++++++++++++
2 files changed, 203 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f47731bf6aac3..bf33a1982c5fa 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1184,8 +1184,17 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
if (PotentialMatches.count(Operand->getParentInst()) == 0)
Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
+
if (Converted) {
ConvertedInstructions.push_back(SDWAInst);
+ auto &MRI = SDWAInst->getParent()->getParent()->getRegInfo();
+ for (MachineOperand &MO : SDWAInst->uses()) {
+ if (!MO.isReg())
+ continue;
+
+ if (!MRI.hasOneUse(MO.getReg()))
+ MRI.clearKillFlags(MO.getReg());
+ }
} else {
SDWAInst->eraseFromParent();
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
new file mode 100644
index 0000000000000..61cc34bf4fa9c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck %s
+
+
+define amdgpu_kernel void @multiuse(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
+ ; CHECK-LABEL: name: multiuse
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset + 32, align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, [[COPY1]](s32), implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.gep1, addrspace 1)
+ ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[COPY1]](s32), 14, implicit $exec
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 15, implicit $exec
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_]], 0, 5, 0, 3, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_1]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa]], 0, 6, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa]], 0, 5, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 101123332
+ ; CHECK-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_2]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_3]], [[V_OR_B32_sdwa1]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa3:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa2]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa4:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.6, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PERM_B32_e64_]], [[REG_SEQUENCE1]], 8, 0, implicit $exec :: (store (s32) into %ir.gep5, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa3]], [[REG_SEQUENCE1]], 16, 0, implicit $exec :: (store (s32) into %ir.gep6, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa4]], [[REG_SEQUENCE1]], 24, 0, implicit $exec :: (store (s32) into %ir.gep7, addrspace 1)
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 7, implicit $exec
+ ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[V_CMP_GT_U32_e64_]], $exec, implicit-def $scc
+ ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_LT_U32_e64_1]], $exec, implicit-def $scc
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.Flow:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[V_CMP_GT_U32_e64_]], %bb.0, [[S_OR_B64_]], %bb.1
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[PHI]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.bb.2:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[V_LSHRREV_B32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa5:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_]], 0, 5, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_e64_4:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_5]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa6:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_4]], 0, 6, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_5:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_6]], [[V_OR_B32_sdwa6]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa7:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa5]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_6:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_5]], implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_7:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa8:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_LSHRREV_B32_e64_2]], 0, [[V_AND_B32_e64_6]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 202113025
+ ; CHECK-NEXT: [[V_PERM_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_7]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
+ ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_8]], [[V_PERM_B32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_PERM_B32_e64_2:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_7]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa9:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa8]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, killed [[V_PERM_B32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_PERM_B32_e64_1]], killed [[V_LSHLREV_B32_e64_1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa7]], [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.11, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa9]], [[REG_SEQUENCE2]], 8, 0, implicit $exec :: (store (s32) into %ir.gep9, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_AND_OR_B32_e64_]], [[REG_SEQUENCE2]], 16, 0, implicit $exec :: (store (s32) into %ir.gep10, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_e64_]], [[REG_SEQUENCE2]], 24, 0, implicit $exec :: (store (s32) into %ir.gep11, addrspace 1)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.bb.3:
+ ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]].sub0, %subreg.sub0, [[S_LOAD_DWORDX2_IMM]].sub1, %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_8:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_9]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_10]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa10:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_4]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa11:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_9:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa11]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa12:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa10]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_9]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa13:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa2]], 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 6, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa14:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa15:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa16:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 3, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa17:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa16]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_10:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa14]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa18:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa15]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa19:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa13]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa12]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s32) into %ir.12, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa17]], [[REG_SEQUENCE3]], 8, 0, implicit $exec :: (store (s32) into %ir.gep13, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa18]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (s32) into %ir.gep14, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa19]], [[REG_SEQUENCE3]], 24, 0, implicit $exec :: (store (s32) into %ir.gep15, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+ %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
+ %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
+ %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
+ %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
+ store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
+ store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
+ store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
+ store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+ %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
+ %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
+ %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
+ %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
+ store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
+ store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
+ store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
+ store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
+ br label %bb.3
+
+bb.3:
+ %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
+ %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
+ %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
+ %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
+ store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
+ store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
+ store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
+ store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SDWA: {{.*}}
>From da686f8cb9f5d13e47afc262d505e6b7c0f02da7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sun, 30 Jun 2024 09:56:55 -0700
Subject: [PATCH 2/2] Review comments
Change-Id: Id3820b1637f2dabfec951111dc361d6e7a576d65
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 6 +-
.../AMDGPU/sdwa-peephole-multiuse-kill.ll | 194 ------------------
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 37 +++-
3 files changed, 37 insertions(+), 200 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bf33a1982c5fa..d428864c9dd59 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1187,13 +1187,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
if (Converted) {
ConvertedInstructions.push_back(SDWAInst);
- auto &MRI = SDWAInst->getParent()->getParent()->getRegInfo();
for (MachineOperand &MO : SDWAInst->uses()) {
if (!MO.isReg())
continue;
-
- if (!MRI.hasOneUse(MO.getReg()))
- MRI.clearKillFlags(MO.getReg());
+
+ MRI->clearKillFlags(MO.getReg());
}
} else {
SDWAInst->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
deleted file mode 100644
index 61cc34bf4fa9c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
+++ /dev/null
@@ -1,194 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck %s
-
-
-define amdgpu_kernel void @multiuse(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
- ; CHECK-LABEL: name: multiuse
- ; CHECK: bb.0.entry:
- ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.src1.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset + 32, align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, [[COPY1]](s32), implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.gep1, addrspace 1)
- ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[COPY1]](s32), 14, implicit $exec
- ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 15, implicit $exec
- ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1.bb.1:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
- ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_]], 0, 5, 0, 3, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 255
- ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_1]], implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_1]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa]], 0, 6, 0, 0, 6, implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_]], 0, 6, 0, 5, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa]], 0, 5, 0, 1, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 101123332
- ; CHECK-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_2]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_3]], [[V_OR_B32_sdwa1]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa3:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa2]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa4:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.6, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PERM_B32_e64_]], [[REG_SEQUENCE1]], 8, 0, implicit $exec :: (store (s32) into %ir.gep5, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa3]], [[REG_SEQUENCE1]], 16, 0, implicit $exec :: (store (s32) into %ir.gep6, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa4]], [[REG_SEQUENCE1]], 24, 0, implicit $exec :: (store (s32) into %ir.gep7, addrspace 1)
- ; CHECK-NEXT: [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 7, implicit $exec
- ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[V_CMP_GT_U32_e64_]], $exec, implicit-def $scc
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_LT_U32_e64_1]], $exec, implicit-def $scc
- ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_]], implicit-def $scc
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2.Flow:
- ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[V_CMP_GT_U32_e64_]], %bb.0, [[S_OR_B64_]], %bb.1
- ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[PHI]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3.bb.2:
- ; CHECK-NEXT: successors: %bb.4(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
- ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[V_LSHRREV_B32_e64_2]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa5:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_]], 0, 5, 0, 1, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 255
- ; CHECK-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_4]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
- ; CHECK-NEXT: [[V_AND_B32_e64_4:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_5]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa6:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_4]], 0, 6, 0, 0, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[V_AND_B32_e64_5:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_6]], [[V_OR_B32_sdwa6]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa7:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa5]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_AND_B32_e64_6:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_5]], implicit $exec
- ; CHECK-NEXT: [[V_AND_B32_e64_7:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_4]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa8:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_LSHRREV_B32_e64_2]], 0, [[V_AND_B32_e64_6]], 0, 5, 0, 0, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 202113025
- ; CHECK-NEXT: [[V_PERM_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_7]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
- ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_8]], [[V_PERM_B32_e64_1]], implicit $exec
- ; CHECK-NEXT: [[V_PERM_B32_e64_2:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_7]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa9:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa8]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, killed [[V_PERM_B32_e64_2]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_PERM_B32_e64_1]], killed [[V_LSHLREV_B32_e64_1]], implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa7]], [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.11, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa9]], [[REG_SEQUENCE2]], 8, 0, implicit $exec :: (store (s32) into %ir.gep9, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_AND_OR_B32_e64_]], [[REG_SEQUENCE2]], 16, 0, implicit $exec :: (store (s32) into %ir.gep10, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_e64_]], [[REG_SEQUENCE2]], 24, 0, implicit $exec :: (store (s32) into %ir.gep11, addrspace 1)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4.bb.3:
- ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]].sub0, %subreg.sub0, [[S_LOAD_DWORDX2_IMM]].sub1, %subreg.sub1
- ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 255
- ; CHECK-NEXT: [[V_AND_B32_e64_8:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_9]], implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
- ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_10]], 0, 6, 0, 5, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa10:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B16_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_4]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa11:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[V_AND_B32_e64_9:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa11]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa12:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa10]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_AND_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_9]], 0, 6, 0, 5, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa13:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa2]], 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 6, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa14:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa15:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa16:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 3, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa17:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa16]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_AND_B32_e64_10:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa14]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa18:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa15]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_sdwa19:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa13]], 0, 6, 0, 4, 6, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa12]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s32) into %ir.12, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa17]], [[REG_SEQUENCE3]], 8, 0, implicit $exec :: (store (s32) into %ir.gep13, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa18]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (s32) into %ir.gep14, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa19]], [[REG_SEQUENCE3]], 24, 0, implicit $exec :: (store (s32) into %ir.gep15, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
- %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
- %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
- %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
- %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
- %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
- %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
- store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
- store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
- store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
- store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
- %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
- %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
- %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
- %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
- %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
- %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
- store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
- store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
- store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
- store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
- br label %bb.3
-
-bb.3:
- %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
- %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
- %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
- %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
- %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
- %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
- %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
- store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
- store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
- store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
- store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
- ret void
-}
-
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SDWA: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 4c61e6803febf..c05e87736c16e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -36,7 +36,7 @@ body: |
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
@@ -185,7 +185,7 @@ body: |
; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.2:
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
@@ -217,3 +217,36 @@ body: |
$sgpr30_sgpr31 = COPY %2
S_SETPC_B64_return $sgpr30_sgpr31
...
+
+# The kill flag on %65 (set at V_AND_B32_e64) must be cleared: after conversion both V_OR_B32_sdwa instructions read %65 directly.
+
+---
+name: multiuse_kill
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; SDWA-LABEL: name: multiuse_kill
+ ; SDWA: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SDWA-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SDWA-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, killed [[DEF]], implicit $exec
+ ; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[DEF1]], implicit $exec
+ ; SDWA-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[DEF1]], 0, [[V_LSHLREV_B32_e64_]], 0, 6, 0, 4, 6, implicit $exec
+ ; SDWA-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, killed [[DEF2]], implicit $exec
+ ; SDWA-NEXT: [[V_OR_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[DEF1]], 0, [[V_LSHLREV_B32_e64_1]], 0, 6, 0, 4, 6, implicit $exec
+ ; SDWA-NEXT: S_ENDPGM 0
+ %68:vgpr_32 = IMPLICIT_DEF
+ %65:vgpr_32 = IMPLICIT_DEF
+ %57:vgpr_32 = IMPLICIT_DEF
+ %71:vgpr_32 = V_LSHLREV_B32_e64 16, killed %68:vgpr_32, implicit $exec
+ %72:sreg_32 = S_MOV_B32 65535
+ %73:vgpr_32 = V_AND_B32_e64 killed %72:sreg_32, killed %65:vgpr_32, implicit $exec
+ %74:vgpr_32 = V_OR_B32_e64 %73:vgpr_32, killed %71:vgpr_32, implicit $exec
+ %75:vgpr_32 = V_LSHLREV_B32_e64 16, killed %57:vgpr_32, implicit $exec
+ %76:vgpr_32 = V_OR_B32_e64 %73:vgpr_32, killed %75:vgpr_32, implicit $exec
+
+ S_ENDPGM 0
+
+...
More information about the llvm-commits
mailing list