[llvm-branch-commits] [llvm] [AMDGPU] Make v2x64 BUILD_VECTOR legal on gfx1251 (PR #204470)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 17 18:09:46 PDT 2026
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/204470
>From d58c35644d873288aba7f95af92d876fd6e818f0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 17 Jun 2026 14:28:11 -0700
Subject: [PATCH] [AMDGPU] Make v2x64 BUILD_VECTOR legal on gfx1251
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 17 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +-
llvm/test/CodeGen/AMDGPU/packed-fp64.ll | 294 +++++++-----------
llvm/test/CodeGen/AMDGPU/packed-u64.ll | 176 +++++------
llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll | 31 +-
llvm/test/CodeGen/AMDGPU/shl.v2i64.ll | 50 +--
6 files changed, 233 insertions(+), 340 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 76f6f091eeeec..42991b0f13299 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -554,13 +554,16 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
+ unsigned EltSizeInRegs = EltVT.getSizeInBits() / 32;
+ assert(IsGCN || EltSizeInRegs == 1);
for (unsigned i = 0; i < NOps; i++) {
// XXX: Why is this here?
if (isa<RegisterSDNode>(N->getOperand(i))) {
IsRegSeq = false;
break;
}
- unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
+ i * EltSizeInRegs, EltSizeInRegs)
: R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -571,7 +574,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
- unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
+ i * EltSizeInRegs, EltSizeInRegs)
: R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
@@ -741,11 +745,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ EVT EltTy = VT.getVectorElementType();
+ assert(EltTy.bitsEq(MVT::i32) || EltTy.bitsEq(MVT::i64));
+ unsigned VecInBits = NumVectorElts * EltTy.getScalarSizeInBits();
const TargetRegisterClass *RegClass =
- N->isDivergent()
- ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
- : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
+ N->isDivergent() ? TRI->getDefaultVectorSuperClassForBitWidth(VecInBits)
+ : SIRegisterInfo::getSGPRClassForBitWidth(VecInBits);
SelectBuildVector(N, RegClass->getID());
return;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e0b0fe46d44a6..5f9ad3679120c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -907,7 +907,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasPackedFP64Ops()) {
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG,
ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
- ISD::FCANONICALIZE},
+ ISD::FCANONICALIZE, ISD::BUILD_VECTOR},
MVT::v2f64, Legal);
setOperationAction(
{ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
@@ -920,7 +920,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasPackedU64Ops()) {
- setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL}, MVT::v2i64, Legal);
+ setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL, ISD::BUILD_VECTOR},
+ MVT::v2i64, Legal);
setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL},
{MVT::v4i64, MVT::v8i64, MVT::v16i64, MVT::v32i64},
Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp64.ll b/llvm/test/CodeGen/AMDGPU/packed-fp64.ll
index 8ed6033809e97..f5e11ae7ee90e 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp64.ll
@@ -55,10 +55,10 @@ define amdgpu_kernel void @fadd_v2_ss(ptr addrspace(1) %a, <2 x double> %x, <2 x
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0
@@ -107,12 +107,12 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x double> %x) {
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[4:7], v[4:7], v[8:11]
@@ -406,9 +406,9 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40590000
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x4059000000000000
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -441,35 +441,20 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
}
define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
-; GFX1251-SDAG-LABEL: fadd_v2_v_v_splat:
-; GFX1251-SDAG: ; %bb.0:
-; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: v_pk_add_f64 v[2:5], v[4:7], v[0:3]
-; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_endpgm
-;
-; GFX1251-GISEL-LABEL: fadd_v2_v_v_splat:
-; GFX1251-GISEL: ; %bb.0:
-; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1251-GISEL-NEXT: v_pk_add_f64 v[2:5], v[4:7], v[0:3]
-; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_endpgm
+; GFX1251-LABEL: fadd_v2_v_v_splat:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: v_mov_b32_e32 v1, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_pk_add_f64 v[2:5], v[4:7], v[0:3]
+; GFX1251-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id
%load = load <2 x double>, ptr addrspace(1) %gep, align 8
@@ -490,9 +475,9 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1.0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -530,9 +515,8 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -569,9 +553,8 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0x3ff00000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 1.0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -608,9 +591,8 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, 2.0 :: v_dual_mov_b32 v6, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 2.0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -653,8 +635,8 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, double %x) {
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
@@ -698,11 +680,12 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, double %x) {
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_xor_b32 s4, s3, 0x80000000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1251-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX1251-SDAG-NEXT: s_mov_b32 s4, s2
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
@@ -745,9 +728,10 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, double %x) {
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_xor_b32 s4, s3, 0x80000000
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s4
+; GFX1251-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX1251-SDAG-NEXT: s_mov_b32 s4, s2
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
@@ -795,10 +779,10 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, double %x, do
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
@@ -845,8 +829,8 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, double %x, do
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7]
@@ -937,10 +921,10 @@ define amdgpu_kernel void @fmul_v2_ss(ptr addrspace(1) %a, <2 x double> %x, <2 x
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0
@@ -989,12 +973,12 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x double> %x) {
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[4:7], v[4:7], v[8:11]
@@ -1288,9 +1272,9 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40590000
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x4059000000000000
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1323,35 +1307,20 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
}
define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
-; GFX1251-SDAG-LABEL: fmul_v2_v_v_splat:
-; GFX1251-SDAG: ; %bb.0:
-; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[2:5], v[4:7], v[0:3]
-; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_endpgm
-;
-; GFX1251-GISEL-LABEL: fmul_v2_v_v_splat:
-; GFX1251-GISEL: ; %bb.0:
-; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[2:5], v[4:7], v[0:3]
-; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_endpgm
+; GFX1251-LABEL: fmul_v2_v_v_splat:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: v_mov_b32_e32 v1, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_pk_mul_f64 v[2:5], v[4:7], v[0:3]
+; GFX1251-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id
%load = load <2 x double>, ptr addrspace(1) %gep, align 8
@@ -1370,9 +1339,9 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40100000
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 4.0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1410,9 +1379,8 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40100000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, 0x40080000 :: v_dual_mov_b32 v6, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 4.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 0x4008000000000000
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1453,8 +1421,8 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, double %x) {
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
@@ -1874,12 +1842,11 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40690000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, 0x40590000 :: v_dual_mov_b32 v6, v4
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v4
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, v4 :: v_dual_mov_b32 v11, v9
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x4069000000000000
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], 0x4059000000000000
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1916,35 +1883,20 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
}
define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
-; GFX1251-SDAG-LABEL: fma_v2_v_v_splat:
-; GFX1251-SDAG: ; %bb.0:
-; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[2:5], v[4:7], v[0:3], v[0:3]
-; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_endpgm
-;
-; GFX1251-GISEL-LABEL: fma_v2_v_v_splat:
-; GFX1251-GISEL: ; %bb.0:
-; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[2:5], v[4:7], v[0:3], v[0:3]
-; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_endpgm
+; GFX1251-LABEL: fma_v2_v_v_splat:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: v_mov_b32_e32 v1, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_pk_fma_f64 v[2:5], v[4:7], v[0:3], v[0:3]
+; GFX1251-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id
%load = load <2 x double>, ptr addrspace(1) %gep, align 8
@@ -1963,12 +1915,11 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, 0x40100000 :: v_dual_mov_b32 v6, v4
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v4
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, v4 :: v_dual_mov_b32 v11, v9
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], 4.0
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -2010,12 +1961,10 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, 2.0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v9, 0x40100000
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v11, 0x40080000 :: v_dual_mov_b32 v10, v4
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v8, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 2.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], 4.0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], 0x4008000000000000
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -2059,14 +2008,13 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, double %x) {
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
-; GFX1251-SDAG-NEXT: s_mov_b32 s4, s2
-; GFX1251-SDAG-NEXT: s_mov_b32 s6, s2
-; GFX1251-SDAG-NEXT: s_mov_b32 s7, s5
+; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1251-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX1251-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
@@ -2493,19 +2441,17 @@ define amdgpu_kernel void @fneg_v2f64_pkfma(ptr addrspace(1) %out) {
; GFX1251-SDAG-LABEL: fneg_v2f64_pkfma:
; GFX1251-SDAG: ; %bb.0: ; %entry
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: s_mov_b32 s2, 0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
-; GFX1251-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1251-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1251-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1251-SDAG-NEXT: v_cndmask_b32_e64 v1, 0x3ff00000, 0, vcc_lo
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], 0, v[0:3] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[2:5], v[0:3], 0, v[0:3] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1]
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: fneg_v2f64_pkfma:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-u64.ll b/llvm/test/CodeGen/AMDGPU/packed-u64.ll
index bbd87fad143e3..9995c1b986a6a 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-u64.ll
@@ -54,12 +54,12 @@ define amdgpu_kernel void @add_v2_ss(ptr addrspace(1) %a, <2 x i64> %x, <2 x i64
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-SDAG-NEXT: v_mov_b32_e32 v8, 0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, s15
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1]
@@ -99,12 +99,12 @@ define amdgpu_kernel void @add_v4_vs(ptr addrspace(1) %a, <4 x i64> %x) {
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[4:7], v[4:7], v[8:11]
@@ -398,9 +398,9 @@ define amdgpu_kernel void @add_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x64
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -433,35 +433,20 @@ define amdgpu_kernel void @add_v2_v_imm(ptr addrspace(1) %a) {
}
define amdgpu_kernel void @add_v2_v_v_splat(ptr addrspace(1) %a) {
-; GFX1251-SDAG-LABEL: add_v2_v_v_splat:
-; GFX1251-SDAG: ; %bb.0:
-; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[2:5], v[4:7], v[0:3]
-; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_endpgm
-;
-; GFX1251-GISEL-LABEL: add_v2_v_v_splat:
-; GFX1251-GISEL: ; %bb.0:
-; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[2:5], v[4:7], v[0:3]
-; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_endpgm
+; GFX1251-LABEL: add_v2_v_v_splat:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: v_mov_b32_e32 v1, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_pk_add_nc_u64 v[2:5], v[4:7], v[0:3]
+; GFX1251-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
@@ -482,9 +467,9 @@ define amdgpu_kernel void @add_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -522,9 +507,8 @@ define amdgpu_kernel void @add_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -561,9 +545,8 @@ define amdgpu_kernel void @add_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v6, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v7, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 1
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -600,9 +583,8 @@ define amdgpu_kernel void @add_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 2
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -689,12 +671,12 @@ define amdgpu_kernel void @sub_v2_ss(ptr addrspace(1) %a, <2 x i64> %x, <2 x i64
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-SDAG-NEXT: v_mov_b32_e32 v8, 0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, s15
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1]
@@ -734,12 +716,12 @@ define amdgpu_kernel void @sub_v4_vs(ptr addrspace(1) %a, <4 x i64> %x) {
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[4:7], v[4:7], v[8:11]
@@ -1033,9 +1015,9 @@ define amdgpu_kernel void @sub_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x64
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1073,9 +1055,9 @@ define amdgpu_kernel void @sub_v2_imm_v(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x64
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1108,35 +1090,20 @@ define amdgpu_kernel void @sub_v2_imm_v(ptr addrspace(1) %a) {
}
define amdgpu_kernel void @sub_v2_v_v_splat(ptr addrspace(1) %a) {
-; GFX1251-SDAG-LABEL: sub_v2_v_v_splat:
-; GFX1251-SDAG: ; %bb.0:
-; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
-; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
-; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[2:5], v[4:7], v[0:3]
-; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-SDAG-NEXT: s_endpgm
-;
-; GFX1251-GISEL-LABEL: sub_v2_v_v_splat:
-; GFX1251-GISEL: ; %bb.0:
-; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[2:5], v[4:7], v[0:3]
-; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
-; GFX1251-GISEL-NEXT: s_endpgm
+; GFX1251-LABEL: sub_v2_v_v_splat:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: v_mov_b32_e32 v1, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_pk_sub_nc_u64 v[2:5], v[4:7], v[0:3]
+; GFX1251-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
@@ -1157,9 +1124,9 @@ define amdgpu_kernel void @sub_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1197,9 +1164,8 @@ define amdgpu_kernel void @sub_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1236,9 +1202,8 @@ define amdgpu_kernel void @sub_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v6, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v7, v4
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 1
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -1275,9 +1240,8 @@ define amdgpu_kernel void @sub_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
-; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, v5
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], 1
+; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], 2
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
index a045572322ee8..ee7e164057938 100644
--- a/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
@@ -102,8 +102,8 @@ define amdgpu_kernel void @pk_lshl_add_u64_s2s(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 2
-; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -147,14 +147,13 @@ define i32 @pk_lshl_add_u64_maybe_oob(<2 x ptr> %p, <2 x i32> %i) {
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_ashrrev_i32 v5, 31, v4
; GFX1251-NEXT: s_mov_b32 s0, 2
-; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1251-NEXT: v_dual_mov_b32 v8, 12 :: v_dual_mov_b32 v11, s0
-; GFX1251-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_ashrrev_i32 v7, 31, v6
-; GFX1251-NEXT: v_mov_b32_e32 v9, 0
-; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[10:11], v[0:3]
-; GFX1251-NEXT: v_dual_mov_b32 v10, v8 :: v_dual_mov_b32 v11, v9
-; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_mov_b64_e32 v[8:9], 12
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1251-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_ashrrev_i32 v7, 31, v6
+; GFX1251-NEXT: v_mov_b32_e32 v12, s0
+; GFX1251-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[12:13], v[0:3]
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11]
; GFX1251-NEXT: flat_load_b32 v4, v[0:1]
; GFX1251-NEXT: flat_load_b32 v5, v[2:3]
@@ -180,8 +179,8 @@ define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_3(<2 x i64> %v, <2 x i64>
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -204,8 +203,8 @@ define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_4(<2 x i64> %v, <2 x i64>
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 4
-; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -226,8 +225,8 @@ define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_5(<2 x i64> %v, <2 x i64>
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 5
-; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
index ecbb578957232..9b57faa81d46f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
@@ -94,14 +94,10 @@ define amdgpu_kernel void @s_shl_v2i64_imm_s(ptr addrspace(1) %out, <2 x i64> %r
; GFX1251-NEXT: s_clause 0x1
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
-; GFX1251-NEXT: s_mov_b64 s[8:9], 0x4d2
-; GFX1251-NEXT: s_wait_xcnt 0x0
-; GFX1251-NEXT: s_movk_i32 s4, 0x162e
-; GFX1251-NEXT: s_mov_b32 s5, s9
; GFX1251-NEXT: v_mov_b32_e32 v4, 0
; GFX1251-NEXT: s_wait_kmcnt 0x0
-; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], s0
-; GFX1251-NEXT: s_lshl_b64 s[2:3], s[4:5], s2
+; GFX1251-NEXT: s_lshl_b64 s[0:1], 0x4d2, s0
+; GFX1251-NEXT: s_lshl_b64 s[2:3], 0x162e, s2
; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[6:7]
@@ -185,36 +181,18 @@ define amdgpu_kernel void @shl_s_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1)
}
define amdgpu_kernel void @shl_imm_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; GFX1250-LABEL: shl_imm_v_v2i64:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
-; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_lshlrev_b64_e64 v[2:3], v2, 8
-; GFX1250-NEXT: v_lshlrev_b64_e64 v[0:1], v0, 8
-; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
-;
-; GFX1251-LABEL: shl_imm_v_v2i64:
-; GFX1251: ; %bb.0:
-; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
-; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1251-NEXT: s_wait_kmcnt 0x0
-; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
-; GFX1251-NEXT: s_wait_xcnt 0x0
-; GFX1251-NEXT: s_mov_b64 s[2:3], 8
-; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1251-NEXT: s_mov_b32 s4, s2
-; GFX1251-NEXT: s_mov_b32 s5, s3
-; GFX1251-NEXT: s_wait_loadcnt 0x0
-; GFX1251-NEXT: v_lshlrev_b64_e64 v[2:3], v2, s[4:5]
-; GFX1251-NEXT: v_lshlrev_b64_e64 v[0:1], v0, s[2:3]
-; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
-; GFX1251-NEXT: s_endpgm
+; GCN-LABEL: shl_imm_v_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e64 v[2:3], v2, 8
+; GCN-NEXT: v_lshlrev_b64_e64 v[0:1], v0, 8
+; GCN-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
More information about the llvm-branch-commits
mailing list