[llvm] [AMDGPU] Reset kill flags for multiple uses of SDWAInst Ops (PR #97135)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 19:54:44 PDT 2024
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/97135
https://github.com/llvm/llvm-project/commit/ded956440739ae326a99cbaef18ce4362e972679 exposed an issue in SIPeepholeSDWA that is causing expensive-checks failures.
This appears to have been introduced by https://github.com/llvm/llvm-project/commit/e7e90dd1c1014b4a7ef77f74af3682168d23ddbf; reverting that commit resolves the issue.
The problem is that when we promote an instruction to SDWA and one of its source registers has multiple uses, we can be left with stale kill flags (as in https://lab.llvm.org/buildbot/#/builders/16/builds/870/steps/6/logs/FAIL__LLVM__vni8-across-blocks_ll ).
This PR simply clears the kill flags on any multi-use register feeding an SDWA promotion.
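For reference, here is the core of the change sketched as a standalone helper (hypothetical name and factoring; the actual patch inlines this logic in SIPeepholeSDWA::convertToSDWA). Since kill flags are only a liveness hint for later passes, clearing them is always conservatively correct:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// After a successful SDWA conversion, drop kill flags on every register the
// new instruction reads that has more than one use: a kill flag recorded on
// any of those uses may no longer mark the last use of the register.
static void clearStaleKillFlags(MachineInstr &SDWAInst,
                                MachineRegisterInfo &MRI) {
  for (MachineOperand &MO : SDWAInst.uses()) {
    if (!MO.isReg())
      continue;
    // clearKillFlags removes the flag from all uses of the vreg; at worst
    // this slightly pessimizes later liveness-based decisions.
    if (!MRI.hasOneUse(MO.getReg()))
      MRI.clearKillFlags(MO.getReg());
  }
}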
From 9620b78eff2038cdb91a04ec6274d6413331341f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 28 Jun 2024 19:03:46 -0700
Subject: [PATCH] [AMDGPU] Reset kill flags for multiple uses of SDWAInst Ops
Change-Id: I8b56d86a55c397623567945a87ad2f55749680bc
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 +
.../AMDGPU/sdwa-peephole-multiuse-kill.ll | 194 ++++++++++++++++++
2 files changed, 203 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f47731bf6aac3..bf33a1982c5fa 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1184,8 +1184,17 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     if (PotentialMatches.count(Operand->getParentInst()) == 0)
       Converted |= Operand->convertToSDWA(*SDWAInst, TII);
   }
+
   if (Converted) {
     ConvertedInstructions.push_back(SDWAInst);
+    auto &MRI = SDWAInst->getParent()->getParent()->getRegInfo();
+    for (MachineOperand &MO : SDWAInst->uses()) {
+      if (!MO.isReg())
+        continue;
+
+      if (!MRI.hasOneUse(MO.getReg()))
+        MRI.clearKillFlags(MO.getReg());
+    }
   } else {
     SDWAInst->eraseFromParent();
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
new file mode 100644
index 0000000000000..61cc34bf4fa9c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-multiuse-kill.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck %s
+
+
+define amdgpu_kernel void @multiuse(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
+ ; CHECK-LABEL: name: multiuse
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset + 32, align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, [[COPY1]](s32), implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.gep1, addrspace 1)
+ ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[COPY1]](s32), 14, implicit $exec
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 15, implicit $exec
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_]], 0, 5, 0, 3, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_1]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa]], 0, 6, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa]], 0, 5, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 101123332
+ ; CHECK-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_2]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_3]], [[V_OR_B32_sdwa1]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa3:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa2]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa4:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa1]], 0, [[V_OR_B32_sdwa]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.6, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PERM_B32_e64_]], [[REG_SEQUENCE1]], 8, 0, implicit $exec :: (store (s32) into %ir.gep5, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa3]], [[REG_SEQUENCE1]], 16, 0, implicit $exec :: (store (s32) into %ir.gep6, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_OR_B32_sdwa4]], [[REG_SEQUENCE1]], 24, 0, implicit $exec :: (store (s32) into %ir.gep7, addrspace 1)
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[COPY1]](s32), 7, implicit $exec
+ ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[V_CMP_GT_U32_e64_]], $exec, implicit-def $scc
+ ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_LT_U32_e64_1]], $exec, implicit-def $scc
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.Flow:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[V_CMP_GT_U32_e64_]], %bb.0, [[S_OR_B64_]], %bb.1
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[PHI]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.bb.2:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[V_LSHRREV_B32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa5:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_]], 0, 5, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_e64_4:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_5]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa6:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_e64_4]], 0, 6, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_5:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_6]], [[V_OR_B32_sdwa6]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa7:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa5]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_6:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_5]], implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_7:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa8:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_LSHRREV_B32_e64_2]], 0, [[V_AND_B32_e64_6]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 202113025
+ ; CHECK-NEXT: [[V_PERM_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_7]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
+ ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, killed [[S_MOV_B32_8]], [[V_PERM_B32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_PERM_B32_e64_2:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[V_LSHRREV_B32_e64_2]], [[S_MOV_B32_7]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa9:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa6]], 0, [[V_OR_B32_sdwa8]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, killed [[V_PERM_B32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_PERM_B32_e64_1]], killed [[V_LSHLREV_B32_e64_1]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa7]], [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.11, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_sdwa9]], [[REG_SEQUENCE2]], 8, 0, implicit $exec :: (store (s32) into %ir.gep9, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_AND_OR_B32_e64_]], [[REG_SEQUENCE2]], 16, 0, implicit $exec :: (store (s32) into %ir.gep10, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_2]], killed [[V_OR_B32_e64_]], [[REG_SEQUENCE2]], 24, 0, implicit $exec :: (store (s32) into %ir.gep11, addrspace 1)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.bb.3:
+ ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]].sub0, %subreg.sub0, [[S_LOAD_DWORDX2_IMM]].sub1, %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_8:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, [[S_MOV_B32_9]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 -256
+ ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_10]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa10:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B16_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_sdwa 0, [[V_MOV_B32_e32_4]], 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, 6, 0, 6, 5, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa11:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[V_AND_B32_e64_9:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa11]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa12:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa10]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B16_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 8, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa2:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[S_MOV_B32_9]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa13:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa2]], 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 6, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa14:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_AND_B32_sdwa1]], 0, 6, 0, 1, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa15:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_sdwa1]], 0, 5, 0, 0, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa16:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0, 0, [[V_LSHLREV_B16_e64_1]], 0, 5, 0, 3, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa17:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa11]], 0, [[V_OR_B32_sdwa16]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_10:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_11]], [[V_OR_B32_sdwa14]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa18:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa15]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa19:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_OR_B32_sdwa14]], 0, [[V_OR_B32_sdwa13]], 0, 6, 0, 4, 6, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa12]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s32) into %ir.12, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa17]], [[REG_SEQUENCE3]], 8, 0, implicit $exec :: (store (s32) into %ir.gep13, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa18]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (s32) into %ir.gep14, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_3]], killed [[V_OR_B32_sdwa19]], [[REG_SEQUENCE3]], 24, 0, implicit $exec :: (store (s32) into %ir.gep15, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+ %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
+ %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
+ %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
+ %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
+ store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
+ store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
+ store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
+ store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+ %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
+ %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
+ %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
+ %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
+ store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
+ store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
+ store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
+ store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
+ br label %bb.3
+
+bb.3:
+ %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
+ %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
+ %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
+ %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
+ store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
+ store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
+ store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
+ store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SDWA: {{.*}}