[llvm] [AMDGPU] BackOffBarrier feature added to gfx1250; Removed incorrect "DS Store drain" check. (PR #179818)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 13:46:48 PST 2026
https://github.com/hidekisaito updated https://github.com/llvm/llvm-project/pull/179818
>From f00de0836c0b4edf422f685a3a6cfe79bf699a16 Mon Sep 17 00:00:00 2001
From: Hideki Saito <hidekido at amd.com>
Date: Wed, 4 Feb 2026 18:39:57 -0500
Subject: [PATCH] [AMDGPU] Removed incorrect "DS Store drain" check.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 1 +
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 23 +-
.../back-off-barrier-subtarget-feature.ll | 59 +
.../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 1178 +++++++++++++++++
.../AMDGPU/waitcnt-loop-ds-store-barrier.mir | 77 ++
5 files changed, 1325 insertions(+), 13 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-store-barrier.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 828f87ccfaf97..1a215f9cfaf1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1935,6 +1935,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50_Common : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureBackOffBarrier,
FeatureRequiresAlignedVGPRs,
FeatureCuMode,
Feature1024AddressableVGPRs,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9a2a7b4923881..e9b38b3a132d0 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -3229,7 +3229,6 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
PreheaderFlushFlags Flags;
bool HasVMemLoad = false;
bool HasVMemStore = false;
- bool SeenDSStoreInLoop = false;
bool UsesVgprLoadedOutsideVMEM = false;
bool UsesVgprLoadedOutsideDS = false;
bool VMemInvalidated = false;
@@ -3240,20 +3239,20 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
DenseSet<MCRegUnit> VgprDefDS;
for (MachineBasicBlock *MBB : ML->blocks()) {
- bool SeenDSStoreInCurrMBB = false;
for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {
HasVMemLoad |= MI.mayLoad();
HasVMemStore |= MI.mayStore();
}
- if (mayStoreIncrementingDSCNT(MI))
- SeenDSStoreInCurrMBB = true;
- // Stores postdominated by a barrier will have a wait at the barrier
- // and thus no need to be waited at the loop header. Barrier found
- // later in the same MBB during in-order traversal is used here as a
- // cheaper alternative to postdomination check.
- if (MI.getOpcode() == AMDGPU::S_BARRIER)
- SeenDSStoreInCurrMBB = false;
+ // TODO: Can we relax DSStore check? There may be cases where
+ // these DS stores are drained prior to the end of MBB (or loop).
+ if (mayStoreIncrementingDSCNT(MI)) {
+ // Early exit if both optimizations are invalidated.
+ // Otherwise, set invalid status and continue.
+ if (VMemInvalidated)
+ return Flags;
+ DSInvalidated = true;
+ }
for (const MachineOperand &Op : MI.all_uses()) {
if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
@@ -3324,8 +3323,6 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
}
}
}
- // Accumulate unprotected DS stores from this MBB
- SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
}
// VMEM flush decision
@@ -3339,7 +3336,7 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// are not used in the loop.
// DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
// is LGKM_CNT which also tracks FLAT/SMEM.
- if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
+ if (!DSInvalidated && UsesVgprLoadedOutsideDS)
Flags.FlushDsCnt = true;
return Flags;
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index 122e683f79664..f1332b883c488 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-BACKOFF %s
; Subtargets must wait for outstanding memory instructions before a barrier if
; they cannot back off of the barrier.
@@ -59,6 +61,33 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX12-BACKOFF: ; %bb.0:
+; GFX12-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_expcnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_samplecnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_bvhcnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_kmcnt 0x0
+; GFX12-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
+; GFX12-BACKOFF-NEXT: s_barrier_signal -1
+; GFX12-BACKOFF-NEXT: s_barrier_wait -1
+; GFX12-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-BACKOFF-NEXT: s_wait_dscnt 0x0
+; GFX12-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX1250-BACKOFF: ; %bb.0:
+; GFX1250-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: s_wait_kmcnt 0x0
+; GFX1250-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-BACKOFF-NEXT: s_barrier_signal -1
+; GFX1250-BACKOFF-NEXT: s_barrier_wait -1
+; GFX1250-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
+; GFX1250-BACKOFF-NEXT: s_wait_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: s_set_pc_i64 s[30:31]
%load = load i32, ptr %in
call void @llvm.amdgcn.s.barrier()
store i32 %load, ptr %out
@@ -121,6 +150,36 @@ define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 {
; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX12-BACKOFF: ; %bb.0:
+; GFX12-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_expcnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_samplecnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_bvhcnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_kmcnt 0x0
+; GFX12-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
+; GFX12-BACKOFF-NEXT: s_wait_storecnt 0x0
+; GFX12-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-BACKOFF-NEXT: s_barrier_signal -1
+; GFX12-BACKOFF-NEXT: s_barrier_wait -1
+; GFX12-BACKOFF-NEXT: global_inv scope:SCOPE_SE
+; GFX12-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-BACKOFF-NEXT: s_wait_dscnt 0x0
+; GFX12-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX1250-BACKOFF: ; %bb.0:
+; GFX1250-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: s_wait_kmcnt 0x0
+; GFX1250-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-BACKOFF-NEXT: s_wait_storecnt 0x0
+; GFX1250-BACKOFF-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: s_barrier_signal -1
+; GFX1250-BACKOFF-NEXT: s_barrier_wait -1
+; GFX1250-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
+; GFX1250-BACKOFF-NEXT: s_wait_dscnt 0x0
+; GFX1250-BACKOFF-NEXT: s_set_pc_i64 s[30:31]
%load = load i32, ptr %in
fence syncscope("workgroup") release
call void @llvm.amdgcn.s.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index fd59fddda0c07..fc6ac2ed85ad3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX12,GCN-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX1250,GCN-OPT %s
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-OPT-LABEL: dpp_test:
@@ -65,6 +67,33 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
store i32 %tmp0, ptr addrspace(1) %out
ret void
@@ -131,6 +160,33 @@ define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_bc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
store i32 %tmp0, ptr addrspace(1) %out
ret void
@@ -222,6 +278,49 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0
+; GFX12-NEXT: ds_load_b32 v1, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v1, v1
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX12-NEXT: s_barrier_signal -1
+; GFX12-NEXT: s_barrier_wait -1
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: flat_store_b32 v[0:1], v2
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test1:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX1250-NEXT: s_barrier_signal -1
+; GFX1250-NEXT: s_barrier_wait -1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -323,6 +422,39 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppi64_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppi64_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
@@ -416,6 +548,39 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppf64_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppf64_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%load = load double, ptr addrspace(1) %gep
@@ -509,6 +674,39 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppv2i32_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppv2i32_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x i32>, ptr addrspace(1) %gep
@@ -602,6 +800,39 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppv2f32_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppv2f32_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep
@@ -695,6 +926,39 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dpp_p0_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dpp_p0_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
%load = load ptr, ptr addrspace(1) %gep
@@ -762,6 +1026,35 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: ds_store_b32 v0, v2
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dpp_p3_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: ds_load_b32 v1, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: ds_store_b32 v0, v2
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dpp_p3_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: ds_store_b32 v0, v2
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
%load = load ptr addrspace(3), ptr addrspace(3) %gep
@@ -844,6 +1137,34 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: scratch_store_b32 v0, v2, off
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dpp_p5_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: scratch_load_b32 v1, v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: scratch_store_b32 v0, v2, s0
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dpp_p5_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_b32 v1, v0, s0 scale_offset
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: scratch_store_b32 v0, v2, s0 scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
%load = load ptr addrspace(5), ptr addrspace(5) %gep
@@ -939,6 +1260,37 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppi64_imm_old_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x7047
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x3afaedd9
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppi64_imm_old_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x7047
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0x3afaedd9
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
@@ -1034,6 +1386,37 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppf64_imm_old_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x405edce1
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x6b8564a
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppf64_imm_old_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x405edce1
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0x6b8564a
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: global_store_b64 v4, v[2:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load double, ptr addrspace(1) %gep
@@ -1125,6 +1508,39 @@ define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64
; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppi64_imm_src_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x7047
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x3afaedd9
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppi64_imm_src_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0x7047
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x3afaedd9
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, ptr addrspace(1) %out
ret void
@@ -1213,6 +1629,39 @@ define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, dou
; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: update_dppf64_imm_src_test:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x405edce1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x6b8564a
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: update_dppf64_imm_src_test:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0x405edce1
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x6b8564a
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0
store double %tmp0, ptr addrspace(1) %out
ret void
@@ -1279,6 +1728,33 @@ define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1345,6 +1821,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1411,6 +1914,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1477,6 +2007,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1543,6 +2100,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1609,6 +2193,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb5:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb5:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1675,6 +2286,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb6:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb6:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1742,6 +2380,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb7:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb7:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1808,6 +2473,33 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_f32_imm_comb8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_f32_imm_comb8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
@@ -1874,6 +2566,33 @@ define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1,
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -1940,6 +2659,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2006,6 +2752,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2072,6 +2845,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2138,6 +2938,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2204,6 +3031,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb5:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb5:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2270,6 +3124,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb6:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb6:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2336,6 +3217,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb7:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb7:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2402,6 +3310,33 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2i16_imm_comb8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2i16_imm_comb8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true)
store <2 x i16> %tmp0, ptr addrspace(1) %out
ret void
@@ -2468,6 +3403,33 @@ define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2534,6 +3496,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2600,6 +3589,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2666,6 +3682,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2732,6 +3775,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2798,6 +3868,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb5:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb5:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2864,6 +3961,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb6:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb6:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2930,6 +4054,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb7:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb7:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
@@ -2996,6 +4147,33 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: dpp_test_v2f16_imm_comb8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: dpp_test_v2f16_imm_comb8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
%tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true)
store <2 x half> %tmp0, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-store-barrier.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-store-barrier.mir
new file mode 100644
index 0000000000000..5b8b5207b1eee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-store-barrier.mir
@@ -0,0 +1,77 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
+
+# Test for the fix that removed the incorrect S_BARRIER check for DS stores.
+# Previously, the code would reset SeenDSStoreInCurrMBB when encountering an
+# S_BARRIER, incorrectly assuming that stores postdominated by a barrier would
+# be waited at the barrier. This was wrong because:
+# 1. S_BARRIER without AutoWaitcntBeforeBarrier does not automatically wait for DS stores to complete
+# 2. S_BARRIER with BackOffBarrier feature does not later flush memory ops by adding ZERO waitcnt
+#
+# This test ensures that when a loop has a DS store followed by S_BARRIER,
+# the preheader flush optimization is NOT applied (no S_WAIT_DSCNT in preheader).
+# The wait should happen inside the loop instead.
+
+---
+# Test: DS store followed by S_BARRIER in loop.
+# DS load in preheader, value used in loop.
+# The preheader should NOT have S_WAIT_DSCNT because SeenDSStoreInLoop = true.
+# Instead, the wait should be inside the loop.
+name: ds_store_barrier_no_preheader_flush
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; CHECK-LABEL: name: ds_store_barrier_no_preheader_flush
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; Verify NO S_WAIT_DSCNT in preheader - the wait must be inside the loop
+ ; CHECK-NOT: S_WAIT_DSCNT
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_WAIT_DSCNT 0
+ ; CHECK-NEXT: $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32 $vgpr2, $vgpr30, 0, 0, implicit $m0, implicit $exec
+ ; With BackOffBarrier, no S_WAIT_DSCNT needed before S_BARRIER
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+
+ ; Preheader: DS load
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr10
+
+ ; Use DS-loaded value from preheader
+ $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr1, implicit $mode, implicit $exec
+
+ ; DS store followed by barrier - this should NOT reset SeenDSStoreInCurrMBB
+ DS_WRITE_B32 $vgpr2, $vgpr30, 0, 0, implicit $m0, implicit $exec
+ S_BARRIER
+
+ ; Loop control
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list