[llvm] Main merge true16 codegen dsload (PR #131427)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 22:40:34 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/131427
>From b2c7a7636101dfd2c56be5d328818ae9b2df0e77 Mon Sep 17 00:00:00 2001
From: Abhinav <abhinav.garg at amd.com>
Date: Thu, 7 Nov 2024 11:34:15 +0530
Subject: [PATCH 1/2] CodeGen using True16 D16 LDS ld/st pseudo instructions
Implement new pseudos with the suffix _t16 which have VGPR_16 as the
store src or load dst. This affects 8-bit and 16-bit LDS loads and stores.
Lower the pseudos to the existing real instructions in the MCInst lowering
layer with VGPR_32 src or dst (which makes them consistent with the hardware
encoding). This patch reduces VGPR usage by making the hi halves of VGPRs
available for other values.
Modified lit tests.
---
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 41 ++
llvm/lib/Target/AMDGPU/DSInstructions.td | 87 ++-
.../AMDGPU/GlobalISel/store-local.128.ll | 61 ++-
.../AMDGPU/GlobalISel/store-local.96.ll | 44 +-
llvm/test/CodeGen/AMDGPU/atomic_load_local.ll | 509 ++++++++++++++----
.../test/CodeGen/AMDGPU/atomic_store_local.ll | 486 ++++++++++++++---
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 92 ++--
llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 28 +-
8 files changed, 1043 insertions(+), 305 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 895d1e77bf1c4..edf62d25e0f8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -187,6 +187,47 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Dest);
OutMI.addOperand(Src);
return;
+ } else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) {
+ uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
+ if (TII->isDS(Opcode)) {
+ if (MI->mayLoad())
+ OpName = llvm::AMDGPU::OpName::vdst;
+ else if (MI->mayStore())
+ OpName = llvm::AMDGPU::OpName::data0;
+ else
+ llvm_unreachable("LDS load or store expected");
+ } else {
+ OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
+ ? llvm::AMDGPU::OpName::vdata
+ : llvm::AMDGPU::OpName::vdst;
+ }
+ int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
+ MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
+ bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
+ Opcode = IsHi ? Info->HiOp : Info->LoOp;
+ MIVDstOrVData.clearParent(); // Avoid use list error in setReg call
+ MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
+
+ int MCOpcode = TII->pseudoToMCOpcode(Opcode);
+ assert(MCOpcode != -1 &&
+ "Pseudo instruction doesn't have a target-specific version");
+ OutMI.setOpcode(MCOpcode);
+ for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
+ const MachineOperand &MO = MI->getOperand(I);
+ MCOperand MCOp;
+ if (I == VDstOrVDataIdx)
+ lowerOperand(MIVDstOrVData, MCOp);
+ else
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+
+ if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
+ MCOperand MCOp;
+ lowerOperand(MIVDstOrVData, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+ return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d3487daee364f..e1e7433b04697 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
}
}
+multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32>
+: DS_1A1D_NORET_mc<opName, rc> {
+ let has_m0_read = 0 in {
+ let True16Predicate = UseRealTrue16Insts in {
+ def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+ }
+ }
+}
+
multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
let has_m0_read = 0 in {
def "" : DS_1A1D_NORET<opName, rc>;
@@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}
+multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset>
+: DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
+ let has_m0_read = 0 in {
+ let True16Predicate = UseRealTrue16Insts in {
+ def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
+ }
+ }
+}
+
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
let has_m0_read = 0 in {
def "" : DS_1A_RET<opName, rc>;
@@ -457,8 +475,6 @@ defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
let mayLoad = 0 in {
-defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">;
-defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">;
defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">;
defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">;
defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">;
@@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
} // End has_m0_read = 0
+defm DS_WRITE_B8 : DS_1A1D_NORET_t16<"ds_write_b8">;
+defm DS_WRITE_B16 : DS_1A1D_NORET_t16<"ds_write_b16">;
+
let SubtargetPredicate = HasDSAddTid in {
def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
}
@@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
}
let mayStore = 0 in {
-defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">;
-defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">;
defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">;
-defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">;
defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">;
defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
@@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
} // End has_m0_read = 0
+defm DS_READ_I8 : DS_1A_RET_t16<"ds_read_i8">;
+defm DS_READ_U8 : DS_1A_RET_t16<"ds_read_u8">;
+defm DS_READ_U16 : DS_1A_RET_t16<"ds_read_u16">;
+
let SubtargetPredicate = HasDSAddTid in {
def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
}
@@ -784,34 +804,51 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
+multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
+
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ let True16Predicate = p in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+ }
+ }
+}
+
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
(inst $ptr, Offset:$offset, (i1 0), $in)
>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
+defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "extloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "zextloadi8_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "load_local">;
foreach vt = Reg32Types.types in {
defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
}
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_zext_8_local">;
+defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_zext_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_zext_8_local">;
-defm : DSReadPat_mc <DS_READ_I8, i16, "atomic_load_sext_8_local">;
+defm : DSReadPat_t16 <DS_READ_I8, i16, "atomic_load_sext_8_local">;
defm : DSReadPat_mc <DS_READ_I8, i32, "atomic_load_sext_8_local">;
-defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
+defm : DSReadPat_t16 <DS_READ_U16, i16, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_zext_16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "atomic_load_sext_16_local">;
@@ -850,18 +887,34 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
}
+multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ let True16Predicate = p in {
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+ }
+ }
+}
+
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "truncstorei8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "store_local">;
foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
-defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
+defm : DSWritePat_t16 <DS_WRITE_B8, i16, "atomic_store_8_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
-defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
+defm : DSWritePat_t16 <DS_WRITE_B16, i16, "atomic_store_16_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index e81bae5d3a416..f6fbae88dc84a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -239,48 +239,53 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s6, 0xffff, s0
; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11-NEXT: v_mov_b32_e32 v5, s4
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
; GFX11-NEXT: s_and_b32 s4, 0xffff, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
; GFX11-NEXT: s_and_b32 s7, 0xffff, s2
+; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-NEXT: s_lshr_b32 s2, s6, 8
; GFX11-NEXT: s_lshr_b32 s6, s5, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
-; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_mov_b16_e32 v2.h, s2
+; GFX11-NEXT: v_mov_b16_e32 v1.h, s5
; GFX11-NEXT: s_lshr_b32 s4, s4, 8
; GFX11-NEXT: s_lshr_b32 s5, s0, 8
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: s_lshr_b32 s0, s7, 8
-; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
-; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
-; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
-; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
-; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
-; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3
+; GFX11-NEXT: v_mov_b16_e32 v3.l, s6
+; GFX11-NEXT: v_mov_b16_e32 v3.h, s4
+; GFX11-NEXT: v_mov_b16_e32 v4.l, s5
+; GFX11-NEXT: ds_store_b8 v5, v0
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:1
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:2
+; GFX11-NEXT: ds_store_b8 v5, v3 offset:3
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:4
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:5
+; GFX11-NEXT: ds_store_b8 v5, v2 offset:6
+; GFX11-NEXT: ds_store_b8 v5, v4 offset:7
+; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: v_mov_b32_e32 v4, s0
+; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
+; GFX11-NEXT: v_mov_b16_e32 v1.h, s0
; GFX11-NEXT: s_and_b32 s0, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s3, 16
; GFX11-NEXT: s_lshr_b32 s0, s0, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s3
+; GFX11-NEXT: v_mov_b16_e32 v2.h, s0
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
-; GFX11-NEXT: v_mov_b32_e32 v8, s0
-; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
-; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
-; GFX11-NEXT: ds_store_b8 v1, v2 offset:10
-; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
-; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
-; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
-; GFX11-NEXT: ds_store_b8 v1, v7 offset:14
-; GFX11-NEXT: ds_store_b8 v1, v8 offset:15
+; GFX11-NEXT: v_mov_b16_e32 v3.l, s1
+; GFX11-NEXT: v_mov_b16_e32 v3.h, s0
+; GFX11-NEXT: ds_store_b8 v5, v1 offset:8
+; GFX11-NEXT: ds_store_b8 v5, v0 offset:9
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:10
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:11
+; GFX11-NEXT: ds_store_b8 v5, v2 offset:12
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:13
+; GFX11-NEXT: ds_store_b8 v5, v3 offset:14
+; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:15
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, ptr addrspace(3) %out, align 1
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 030f01a8bd5ea..27816a9375d30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -207,36 +207,42 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s0
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v6, s3
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
; GFX11-NEXT: s_and_b32 s3, 0xffff, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
; GFX11-NEXT: s_and_b32 s6, 0xffff, s2
+; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-NEXT: s_lshr_b32 s2, s5, 8
; GFX11-NEXT: s_lshr_b32 s5, s4, 8
-; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, s2
+; GFX11-NEXT: v_mov_b16_e32 v1.h, s4
; GFX11-NEXT: s_lshr_b32 s3, s3, 8
; GFX11-NEXT: s_lshr_b32 s4, s0, 8
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: s_lshr_b32 s0, s6, 8
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3
-; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_mov_b32_e32 v12, s6
-; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
-; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
-; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
-; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
-; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
-; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
-; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
-; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
-; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
+; GFX11-NEXT: v_mov_b16_e32 v3.h, s5
+; GFX11-NEXT: v_mov_b16_e32 v2.h, s1
+; GFX11-NEXT: v_mov_b16_e32 v4.l, s3
+; GFX11-NEXT: v_mov_b16_e32 v4.h, s4
+; GFX11-NEXT: v_mov_b16_e32 v5.l, s0
+; GFX11-NEXT: v_mov_b16_e32 v5.h, s6
+; GFX11-NEXT: ds_store_b8 v6, v0
+; GFX11-NEXT: ds_store_b8 v6, v3 offset:1
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v1 offset:2
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v3 offset:3
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v0 offset:4
+; GFX11-NEXT: ds_store_b8 v6, v4 offset:5
+; GFX11-NEXT: ds_store_b8 v6, v2 offset:6
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v4 offset:7
+; GFX11-NEXT: ds_store_b8 v6, v1 offset:8
+; GFX11-NEXT: ds_store_b8 v6, v5 offset:9
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v2 offset:10
+; GFX11-NEXT: ds_store_b8_d16_hi v6, v5 offset:11
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, ptr addrspace(3) %out, align 1
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index a3b6c283512f3..7f45b038b6d0d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -1,208 +1,493 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u8 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u8 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
ret i8 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i8_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i8_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u8 v0, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i8_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u8 v0, v0 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 offset:16
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
%load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
ret i8 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
ret i16 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i16_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i16_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i16_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
%load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
ret i16 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
ret i32 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
%load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
ret i32 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
ret i64 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
%load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
ret i64 %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_f32_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_f32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
%load = load atomic float, ptr addrspace(3) %gep monotonic, align 4
ret float %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_f64_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_f64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16
%load = load atomic double, ptr addrspace(3) %gep monotonic, align 8
ret double %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_p0i8_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_p0i8_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_p0i8_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_p0i8_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8
ret ptr %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_p3i8_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_p3i8_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_b32 v0, v0 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_p3i8_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b32 v0, v0 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_load_monotonic_p3i8_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
ret ptr addrspace(3) %load
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_f16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast half %load to i16
ret i16 %ret
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_f16_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_f16_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
%load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast half %load to i16
ret i16 %ret
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_bf16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_bf16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast bfloat %load to i16
ret i16 %ret
}
-; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
+; CI-LABEL: atomic_load_monotonic_bf16_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_read_u16 v0, v0 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_monotonic_bf16_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_u16 v0, v0 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
%load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast bfloat %load to i16
ret i16 %ret
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index cd1e1fb1add47..9236b4018317a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -1,156 +1,470 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; GCN-LABEL: {{^}}atomic_store_monotonic_i8:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b8 v0, v1{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
+; CI-LABEL: atomic_store_monotonic_i8:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b8 v0, v1
+; CI-NEXT: ds_write_b8 v0, v2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b8 v0, v1
+; GFX9-NEXT: ds_write_b8 v0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1
+; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %val1 = add i8 %val, 2
store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1
+ store atomic i8 %val1, ptr addrspace(3) %ptr monotonic, align 1
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i8:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b8 v0, v1 offset:16{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
- %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
- store atomic i8 %val, ptr addrspace(3) %gep monotonic, align 1
+; CI-LABEL: atomic_store_monotonic_offset_i8:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b8 v0, v1 offset:8
+; CI-NEXT: ds_write_b8 v0, v2 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
+; GFX9-NEXT: ds_write_b8 v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 offset:16
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %val1 = add i8 %val, 2
+ %gep_1 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 8
+ %gep_2 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
+ store atomic i8 %val, ptr addrspace(3) %gep_1 monotonic, align 1
+ store atomic i8 %val1, ptr addrspace(3) %gep_2 monotonic, align 1
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_i16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) {
+; CI-LABEL: atomic_store_monotonic_i16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1
+; CI-NEXT: ds_write_b16 v0, v2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: ds_write_b16 v0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %val1 = add i16 %val, 2
store atomic i16 %val, ptr addrspace(3) %ptr monotonic, align 2
+ store atomic i16 %val1, ptr addrspace(3) %ptr monotonic, align 2
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val) {
+; CI-LABEL: atomic_store_monotonic_offset_i16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1 offset:32
+; CI-NEXT: ds_write_b16 v0, v2 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:32
+; GFX9-NEXT: ds_write_b16 v0, v2 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %val1 = add i16 %val, 2
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
store atomic i16 %val, ptr addrspace(3) %gep monotonic, align 2
+ store atomic i16 %val1, ptr addrspace(3) %gep monotonic, align 2
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_i32:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b32 v0, v1{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i32(ptr addrspace(3) %ptr, i32 %val) {
+; CI-LABEL: atomic_store_monotonic_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_write_b32 v0, v1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v0, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_store_monotonic_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_store_b32 v0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store atomic i32 %val, ptr addrspace(3) %ptr monotonic, align 4
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i32:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i32(ptr addrspace(3) %ptr, i32 %val) {
+; CI-LABEL: atomic_store_monotonic_offset_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_write_b32 v0, v1 offset:64
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v0, v1 offset:64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_store_monotonic_offset_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_store_b32 v0, v1 offset:64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
store atomic i32 %val, ptr addrspace(3) %gep monotonic, align 4
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_i64:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i64(ptr addrspace(3) %ptr, i64 %val) {
+; CI-LABEL: atomic_store_monotonic_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_write_b64 v0, v[1:2]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_write_b64 v0, v[1:2]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_store_monotonic_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_store_b64 v0, v[1:2]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store atomic i64 %val, ptr addrspace(3) %ptr monotonic, align 8
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i64:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val) {
+; CI-LABEL: atomic_store_monotonic_offset_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_write_b64 v0, v[1:2] offset:128
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_write_b64 v0, v[1:2] offset:128
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: atomic_store_monotonic_offset_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: ds_store_b64 v0, v[1:2] offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i64 16
store atomic i64 %val, ptr addrspace(3) %gep monotonic, align 8
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_f16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+; CI-LABEL: atomic_store_monotonic_f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1
+; CI-NEXT: ds_write_b16 v0, v2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: ds_write_b16 v0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg.val1 = add i16 %arg.val, 2
%val = bitcast i16 %arg.val to half
+ %val1 = bitcast i16 %arg.val1 to half
store atomic half %val, ptr addrspace(3) %ptr monotonic, align 2
+ store atomic half %val1, ptr addrspace(3) %ptr monotonic, align 2
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+; CI-LABEL: atomic_store_monotonic_offset_f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1 offset:32
+; CI-NEXT: ds_write_b16 v0, v2 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:32
+; GFX9-NEXT: ds_write_b16 v0, v2 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg.val1 = add i16 %arg.val, 2
+ %val1 = bitcast i16 %arg.val1 to half
%val = bitcast i16 %arg.val to half
%gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
store atomic half %val, ptr addrspace(3) %gep monotonic, align 2
+ store atomic half %val1, ptr addrspace(3) %gep monotonic, align 2
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_bf16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+; CI-LABEL: atomic_store_monotonic_bf16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1
+; CI-NEXT: ds_write_b16 v0, v2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: ds_write_b16 v0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg.val1 = add i16 %arg.val, 2
+ %val1 = bitcast i16 %arg.val1 to bfloat
%val = bitcast i16 %arg.val to bfloat
store atomic bfloat %val, ptr addrspace(3) %ptr monotonic, align 2
+ store atomic bfloat %val1, ptr addrspace(3) %ptr monotonic, align 2
ret void
}
-; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16:
-; GCN: s_waitcnt
-; GFX9-NOT: s_mov_b32 m0
-; CI-NEXT: s_mov_b32 m0
-; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+; CI-LABEL: atomic_store_monotonic_offset_bf16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1
+; CI-NEXT: ds_write_b16 v0, v1 offset:32
+; CI-NEXT: ds_write_b16 v0, v2 offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_store_monotonic_offset_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, 2, v1
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:32
+; GFX9-NEXT: ds_write_b16 v0, v2 offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
+; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
+; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg.val1 = add i16 %arg.val, 2
+ %val1 = bitcast i16 %arg.val1 to bfloat
%val = bitcast i16 %arg.val to bfloat
%gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
store atomic bfloat %val, ptr addrspace(3) %gep monotonic, align 2
+ store atomic bfloat %val1, ptr addrspace(3) %gep monotonic, align 2
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 806fe899a9149..c739ba2183ef9 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -224,15 +224,25 @@ define <2 x half> @chain_hi_to_lo_group() {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_group:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: ds_load_u16 v0, v1 offset:2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_group:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v1 offset:2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_group:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v1 offset:2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1
%load_lo = load half, ptr addrspace(3) %gep_lo
@@ -263,14 +273,23 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_l
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_u16 v0, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_different_bases:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_different_bases:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr addrspace(3) %base_lo
%load_hi = load half, ptr addrspace(3) %base_hi
@@ -780,16 +799,27 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_u16 v1, v0 offset:2
-; GFX11-NEXT: ds_load_u16_d16_hi v0, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v1, v0 offset:2
+; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_u16 v1, v0 offset:2
+; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
%load_lo = load volatile i16, ptr addrspace(3) %gep_lo
@@ -1047,12 +1077,12 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x7b
-; GFX11-TRUE16-NEXT: ds_load_u16 v3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b
+; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0
; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2
-; GFX11-TRUE16-NEXT: ds_load_u16 v0, v0 offset:2
+; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store:
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 8f702da64c508..bd4d640efb050 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -229,10 +229,11 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 {
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-NEXT: ds_store_b8 v1, v0 offset:65535
; GFX11-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%shl = shl i32 %x.i, 4
@@ -273,11 +274,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: ds_store_b8 v0, v1
+; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-NEXT: ds_store_b8 v1, v0
; GFX11-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%.neg = mul i32 %x.i, -4
@@ -318,11 +320,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: ds_store_b8 v0, v1
+; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-NEXT: ds_store_b8 v1, v0
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
@@ -361,11 +364,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
-; GFX11-NEXT: ds_store_b8 v0, v1
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0
+; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-NEXT: ds_store_b8 v1, v0
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
>From 518d4ec9ad6706494accd83cf1a3bec809cc8e20 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Sat, 15 Mar 2025 00:34:28 -0400
Subject: [PATCH 2/2] fix test
---
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 41 -
.../AMDGPU/GlobalISel/store-local.128.ll | 61 +-
.../AMDGPU/GlobalISel/store-local.96.ll | 44 +-
llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 108 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 1748 +++++++++++------
5 files changed, 1301 insertions(+), 701 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index edf62d25e0f8e..895d1e77bf1c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -187,47 +187,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Dest);
OutMI.addOperand(Src);
return;
- } else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) {
- uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
- if (TII->isDS(Opcode)) {
- if (MI->mayLoad())
- OpName = llvm::AMDGPU::OpName::vdst;
- else if (MI->mayStore())
- OpName = llvm::AMDGPU::OpName::data0;
- else
- llvm_unreachable("LDS load or store expected");
- } else {
- OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
- ? llvm::AMDGPU::OpName::vdata
- : llvm::AMDGPU::OpName::vdst;
- }
- int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
- MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
- bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
- Opcode = IsHi ? Info->HiOp : Info->LoOp;
- MIVDstOrVData.clearParent(); // Avoid use list error in setReg call
- MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
-
- int MCOpcode = TII->pseudoToMCOpcode(Opcode);
- assert(MCOpcode != -1 &&
- "Pseudo instruction doesn't have a target-specific version");
- OutMI.setOpcode(MCOpcode);
- for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
- const MachineOperand &MO = MI->getOperand(I);
- MCOperand MCOp;
- if (I == VDstOrVDataIdx)
- lowerOperand(MIVDstOrVData, MCOp);
- else
- lowerOperand(MO, MCOp);
- OutMI.addOperand(MCOp);
- }
-
- if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
- MCOperand MCOp;
- lowerOperand(MIVDstOrVData, MCOp);
- OutMI.addOperand(MCOp);
- }
- return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index f6fbae88dc84a..e81bae5d3a416 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -239,53 +239,48 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s6, 0xffff, s0
; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
; GFX11-NEXT: s_and_b32 s4, 0xffff, s1
-; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
; GFX11-NEXT: s_and_b32 s7, 0xffff, s2
-; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-NEXT: s_lshr_b32 s2, s6, 8
; GFX11-NEXT: s_lshr_b32 s6, s5, 8
-; GFX11-NEXT: v_mov_b16_e32 v2.h, s2
-; GFX11-NEXT: v_mov_b16_e32 v1.h, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
+; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: s_lshr_b32 s4, s4, 8
; GFX11-NEXT: s_lshr_b32 s5, s0, 8
-; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: s_lshr_b32 s0, s7, 8
-; GFX11-NEXT: v_mov_b16_e32 v3.l, s6
-; GFX11-NEXT: v_mov_b16_e32 v3.h, s4
-; GFX11-NEXT: v_mov_b16_e32 v4.l, s5
-; GFX11-NEXT: ds_store_b8 v5, v0
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:1
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:2
-; GFX11-NEXT: ds_store_b8 v5, v3 offset:3
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:4
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:5
-; GFX11-NEXT: ds_store_b8 v5, v2 offset:6
-; GFX11-NEXT: ds_store_b8 v5, v4 offset:7
-; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT: ds_store_b8 v1, v0
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
+; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
+; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
+; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
+; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
+; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
+; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
-; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
-; GFX11-NEXT: v_mov_b16_e32 v1.h, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: s_and_b32 s0, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s3, 16
; GFX11-NEXT: s_lshr_b32 s0, s0, 8
-; GFX11-NEXT: v_mov_b16_e32 v2.l, s3
-; GFX11-NEXT: v_mov_b16_e32 v2.h, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
-; GFX11-NEXT: v_mov_b16_e32 v3.l, s1
-; GFX11-NEXT: v_mov_b16_e32 v3.h, s0
-; GFX11-NEXT: ds_store_b8 v5, v1 offset:8
-; GFX11-NEXT: ds_store_b8 v5, v0 offset:9
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:10
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:11
-; GFX11-NEXT: ds_store_b8 v5, v2 offset:12
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:13
-; GFX11-NEXT: ds_store_b8 v5, v3 offset:14
-; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:15
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
+; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
+; GFX11-NEXT: ds_store_b8 v1, v2 offset:10
+; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
+; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
+; GFX11-NEXT: ds_store_b8 v1, v7 offset:14
+; GFX11-NEXT: ds_store_b8 v1, v8 offset:15
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, ptr addrspace(3) %out, align 1
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 27816a9375d30..030f01a8bd5ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -207,42 +207,36 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s0
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
; GFX11-NEXT: s_and_b32 s3, 0xffff, s1
-; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
; GFX11-NEXT: s_and_b32 s6, 0xffff, s2
-; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-NEXT: s_lshr_b32 s2, s5, 8
; GFX11-NEXT: s_lshr_b32 s5, s4, 8
-; GFX11-NEXT: v_mov_b16_e32 v3.l, s2
-; GFX11-NEXT: v_mov_b16_e32 v1.h, s4
+; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: s_lshr_b32 s3, s3, 8
; GFX11-NEXT: s_lshr_b32 s4, s0, 8
-; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: s_lshr_b32 s0, s6, 8
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: v_mov_b16_e32 v3.h, s5
-; GFX11-NEXT: v_mov_b16_e32 v2.h, s1
-; GFX11-NEXT: v_mov_b16_e32 v4.l, s3
-; GFX11-NEXT: v_mov_b16_e32 v4.h, s4
-; GFX11-NEXT: v_mov_b16_e32 v5.l, s0
-; GFX11-NEXT: v_mov_b16_e32 v5.h, s6
-; GFX11-NEXT: ds_store_b8 v6, v0
-; GFX11-NEXT: ds_store_b8 v6, v3 offset:1
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v1 offset:2
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v3 offset:3
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v0 offset:4
-; GFX11-NEXT: ds_store_b8 v6, v4 offset:5
-; GFX11-NEXT: ds_store_b8 v6, v2 offset:6
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v4 offset:7
-; GFX11-NEXT: ds_store_b8 v6, v1 offset:8
-; GFX11-NEXT: ds_store_b8 v6, v5 offset:9
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v2 offset:10
-; GFX11-NEXT: ds_store_b8_d16_hi v6, v5 offset:11
+; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3
+; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v12, s6
+; GFX11-NEXT: ds_store_b8 v1, v0
+; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
+; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
+; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
+; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
+; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
+; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
+; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
+; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
+; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
+; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, ptr addrspace(3) %out, align 1
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index bd4d640efb050..7819da8b97e55 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -227,14 +228,22 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: add_x_shl_max_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0
-; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
-; GFX11-NEXT: ds_store_b8 v1, v0 offset:65535
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: add_x_shl_max_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 offset:65535
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: add_x_shl_max_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:65535
+; GFX11-FAKE16-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%shl = shl i32 %x.i, 4
%add = add i32 %shl, 65535
@@ -272,15 +281,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
-; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX11-FAKE16-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%.neg = mul i32 %x.i, -4
%add = add i32 %.neg, 65535
@@ -318,15 +336,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
-; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX11-FAKE16-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -362,15 +389,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0
-; GFX11-NEXT: v_mov_b16_e32 v0.l, 13
-; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13
+; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
+; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX11-FAKE16-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index dcb1d0e8c20a1..027576630c877 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -17,11 +17,15 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-FAKE16 %s
; Test for integer mad formation for patterns used in clpeak
@@ -324,71 +328,137 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_imad_pat_i16:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_imad_pat_i16:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
%add = mul i16 %conv33, %y
@@ -1461,71 +1531,137 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_umad_pat_i16:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_umad_pat_i16:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
%add = mul i16 %conv33, %y
@@ -4315,71 +4451,137 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_imad_pat_i8:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_imad_pat_i8:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_imad_pat_i8:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_imad_pat_i8:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i8 %x, 1
%add = mul i8 %conv33, %y
@@ -4584,113 +4786,221 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_nc_u16 v1, v1, 1
-; GFX11-SDAG-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1
-; GFX11-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0
-; GFX11-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3
-; GFX11-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i8:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0
-; GFX11-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1
-; GFX11-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_imad_pat_v2i8:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_add_nc_u16 v1, v1, 1
-; GFX1200-SDAG-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1
-; GFX1200-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0
-; GFX1200-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3
-; GFX1200-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3
-; GFX1200-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1
-; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1200-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1200-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i8:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0
-; GFX1200-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1
-; GFX1200-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3
+; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1
+; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3
+; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3
+; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1
+; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i8> %x, <i8 1, i8 1>
%add = mul <2 x i8> %y18, %y
@@ -7656,103 +7966,201 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_imad_pat_i16_x2:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_imad_pat_i16_x2:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16_x2:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16_x2:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
%add = mul i16 %conv69, %y
@@ -7915,103 +8323,201 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: clpeak_umad_pat_i16_x2:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: clpeak_umad_pat_i16_x2:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16_x2:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
-; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16_x2:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1
-; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1
-; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0
+; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
%add = mul i16 %conv69, %y
@@ -8792,51 +9298,95 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: multi_use_mul_mad_i16_var:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2
-; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-SDAG-LABEL: multi_use_mul_mad_i16_var:
-; GFX1200-SDAG: ; %bb.0: ; %entry
-; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2
-; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-GISEL-LABEL: multi_use_mul_mad_i16_var:
-; GFX1200-GISEL: ; %bb.0: ; %entry
-; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1200-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2
-; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l
+; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3
+; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX1200-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
%add0 = add i16 %mul, %z0
@@ -8956,27 +9506,93 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: other_use_mul_mad_i16_var:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_lo_u16 v4, v0, v1
-; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
-; GFX11-NEXT: ds_store_b16 v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1200-LABEL: other_use_mul_mad_i16_var:
-; GFX1200: ; %bb.0: ; %entry
-; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1200-NEXT: s_wait_expcnt 0x0
-; GFX1200-NEXT: s_wait_samplecnt 0x0
-; GFX1200-NEXT: s_wait_bvhcnt 0x0
-; GFX1200-NEXT: s_wait_kmcnt 0x0
-; GFX1200-NEXT: v_mul_lo_u16 v4, v0, v1
-; GFX1200-NEXT: v_mad_u16 v0, v0, v1, v2
-; GFX1200-NEXT: ds_store_b16 v3, v4
-; GFX1200-NEXT: s_wait_dscnt 0x0
-; GFX1200-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var:
+; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX1200-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var:
+; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1
+; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX1200-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var:
+; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l
+; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX1200-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0
+; GFX1200-GISEL-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1200-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var:
+; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1
+; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX1200-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4
+; GFX1200-GISEL-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
%add0 = add i16 %mul, %z
More information about the llvm-commits
mailing list