[llvm] [AMDGPU] Restrict packed math FP32 instructions to read only one SGPR per operand on gfx12+ (PR #152465)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 02:28:00 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/152465
Sec. 4.6.7.1 of the gfx1250 SPG states that if an SGPR is used
as an operand, only one SGPR will be read for both the low and high
operations. As a result, the corresponding bits in `op_sel` and
`op_sel_hi` must be the same when the operand is an SGPR.
Co-authored-by: Tian, Shilei <Shilei.Tian at amd.com>
>From 79ac07e41777838efd94d909c243cfec8e1e4ff3 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 7 Aug 2025 02:24:35 -0700
Subject: [PATCH] [AMDGPU] Restrict packed math FP32 instructions to read only
one SGPR per operand on gfx12+
Sec. 4.6.7.1 of the gfx1250 SPG states that if an SGPR is used
as an operand, only one SGPR will be read for both the low and high
operations. As a result, the corresponding bits in `op_sel` and
`op_sel_hi` must be the same when the operand is an SGPR.
Co-authored-by: Tian, Shilei <Shilei.Tian at amd.com>
---
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 37 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 13 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 14 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 +
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 917 +++++++++++-------
llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s | 74 ++
7 files changed, 753 insertions(+), 371 deletions(-)
create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ff8efd2debc21..0d2feeb4edea3 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return false;
}
+ // Packed math FP32 instructions typically accept SGPRs or VGPRs as source
+ // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read
+ // the first SGPR and use it for both the low and high operations.
+ if (isPackedFP32Inst(Opc) && isGFX12Plus()) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+ const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool {
+ unsigned Mask = 1U << Index;
+ return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0);
+ };
+
+ if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/0))
+ return false;
+ if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/1))
+ return false;
+
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1) {
+ const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+ if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/2))
+ return false;
+ }
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f20b22d14c984..19e6bcf6a219d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -18,6 +18,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
+ return false;
+ }
+ }
+
return true;
}
@@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
+ MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
+ constexpr const AMDGPU::OpName OpNames[] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+ for (auto [I, OpName] : enumerate(OpNames)) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
+ if (static_cast<unsigned>(SrcIdx) == OpIdx &&
+ !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
+ return false;
+ }
+ }
+
if (!isLegalRegOperand(MRI, OpInfo, MO))
return false;
@@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return true;
}
+bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO) const {
+ constexpr const unsigned NumOps = 3;
+ constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
+
+ assert(SrcN < NumOps);
+
+ if (!MO) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
+ if (SrcIdx == -1)
+ return true;
+ MO = &MI.getOperand(SrcIdx);
+ }
+
+ if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
+ return true;
+
+ int ModsIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
+ if (ModsIdx == -1)
+ return true;
+
+ unsigned Mods = MI.getOperand(ModsIdx).getImm();
+ bool OpSel = Mods & SISrcMods::OP_SEL_0;
+ bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
+
+ return !OpSel && !OpSelHi;
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
legalizeOpWithMove(MI, VOP3Idx[2]);
+
+ // Fix the register class of packed FP32 instructions on gfx12+. See
+ // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
+ if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
+ legalizeOpWithMove(MI, VOP3Idx[I]);
+ }
+ }
}
Register SIInstrInfo::readlaneVGPRToSGPR(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59eb0f04..6b9403f9c7a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1287,6 +1287,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &MO) const;
bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const;
+
+ /// Check if \p MO would be a legal operand for gfx12+ packed math FP32
+ /// instructions. Packed math FP32 instructions typically accept SGPRs or
+ /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the
+ /// HW can only read the first SGPR and use it for both the low and high
+ /// operations.
+ /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2,
+ /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will
+ /// be used.
+ bool isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO = nullptr) const;
+
/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 00dcb9b52d4bd..1e3e9a20afb2e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
return 128;
}
+bool isPackedFP32Inst(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_F32_gfx12:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F32_gfx12:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMA_F32_gfx12:
+ return true;
+ default:
+ return false;
+ }
+}
+
} // namespace AMDGPU
raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1252e35d81e82..1bcd36cf6241c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg);
bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
+LLVM_READONLY bool isPackedFP32Inst(unsigned Opc);
+
LLVM_READONLY
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset);
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 42401afb6edf2..8304be958f1ad 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -78,12 +78,14 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fadd_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -142,13 +144,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v4_vs:
@@ -156,13 +161,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -332,56 +340,69 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[42:43]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[36:37]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[56:57]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[36:37]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -389,54 +410,70 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -502,15 +539,16 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -645,15 +683,16 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -703,13 +742,15 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -746,17 +787,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fadd_v2_v_lit_lo0:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x3f80000000000000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -792,17 +847,31 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fadd_v2_v_unfoldable_lit:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x400000003f800000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1085,12 +1154,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] neg_lo:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2:
@@ -1159,12 +1230,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2:
@@ -1262,12 +1335,14 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fmul_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1326,13 +1401,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v4_vs:
@@ -1340,13 +1418,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -1516,56 +1597,69 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[42:43]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[36:37]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[38:39]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[56:57]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[36:37]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -1573,54 +1667,70 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -1685,15 +1795,16 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fmul_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1828,15 +1939,16 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1873,17 +1985,31 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fmul_v2_v_unfoldable_lit:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2040,12 +2166,14 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fma_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2104,13 +2232,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[4:5], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v4_vs:
@@ -2118,13 +2249,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[6:7], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -2294,56 +2428,68 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fma_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v34, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v34, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v34, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v34, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v34, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v34, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v34, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v34, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v34, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[30:31]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[28:29]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[56:57], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[6:7]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[24:25]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[26:27]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[8:9]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[10:11]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[54:55], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[56:57], v[56:57]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[50:51], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[40:41], v[40:41]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[16:19], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[12:15], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[8:11], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[4:7], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[20:23], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[24:27], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[0:3], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v34, v[28:31], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
@@ -2351,54 +2497,70 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[16:17], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[18:19], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[20:21], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[22:23], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[24:25], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[26:27], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[28:29], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[30:31], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[44:45], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[46:47], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[48:49], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[50:51], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[4:5], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[6:7], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[8:9], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[10:11], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[12:13], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[14:15], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -2488,17 +2650,19 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fma_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2653,17 +2817,19 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2740,29 +2906,30 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[4:5], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], lit64(0x400000003f800000)
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -3268,20 +3435,22 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v0, v1
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0
-; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
@@ -3363,15 +3532,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_add_f32 s6, s1, s3
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], s[6:7] op_sel_hi:[1,0]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[0:1], s[2:3] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
@@ -3380,13 +3550,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v2, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1]
-; GFX1250-GISEL-NEXT: v_dual_subrev_f32 v3, s3, v0 :: v_dual_mov_b32 v0, 0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
@@ -3593,7 +3766,9 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s
new file mode 100644
index 0000000000000..1ea64de5cbc9e
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_err.s
@@ -0,0 +1,74 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], v[4:5], s[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5] op_sel:[1,0,0] op_sel_hi:[0,0,0]
+// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], s[0:1], v[0:1], v[4:5] op_sel:[1,0,0] op_sel_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,1,0] op_sel_hi:[0,0,0]
+// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], v[4:5], s[0:1] op_sel:[0,0,1] op_sel_hi:[0,0,0]
+// GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], s[0:1], v[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], v[0:1], s[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], s[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,0]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,1]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], s[0:1], v[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], v[0:1], s[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], s[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,0]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], v[0:1], s[0:1] op_sel:[0,1] op_sel_hi:[0,1]
+// GFX12-ERR: :[[@LINE-1]]:37: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], exec, v[0:1], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], exec, v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_fma_f32 v[8:9], v[0:1], v[4:5], exec
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], exec, v[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_mul_f32 v[8:9], v[0:1], exec
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], exec, v[0:1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
+
+v_pk_add_f32 v[8:9], v[0:1], exec
+// GFX12-ERR: :[[@LINE-1]]:1: error: invalid op_sel operand
More information about the llvm-commits
mailing list