[llvm] [AMDGPU] Add V_ADD|SUB|MUL_U64 gfx1250 opcodes (PR #150291)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 12:20:45 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/150291
None
From 19ceda7b605f906190e4f64ab224f00876c5ea78 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 23 Jul 2025 11:16:26 -0700
Subject: [PATCH] [AMDGPU] Add V_ADD|SUB|MUL_U64 gfx1250 opcodes
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 8 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 17 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 +
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 17 +
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 521 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/add_u64.ll | 129 +++++
.../AMDGPU/branch-relaxation-gfx1250.ll | 2 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 398 +++++++++++++
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 10 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 268 +++++----
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 10 +-
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 117 ++--
llvm/test/CodeGen/AMDGPU/literal64.ll | 44 +-
llvm/test/CodeGen/AMDGPU/mul.ll | 422 ++++++++++++++
llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll | 4 +-
llvm/test/CodeGen/AMDGPU/sub_u64.ll | 146 +++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s | 356 ++++++++++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s | 5 +
.../Disassembler/AMDGPU/gfx1250_dasm_vop2.txt | 258 +++++++++
23 files changed, 2506 insertions(+), 250 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/add_u64.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/sub_u64.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1c7ee724fef09..ff2595ef51869 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1355,6 +1355,10 @@ def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
+def FeatureAddSubU64Insts
+ : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+ "Has v_add_u64 and v_sub_u64 instructions">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -2010,6 +2014,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
FeatureLshlAddU64Inst,
+ FeatureAddSubU64Insts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2787,6 +2792,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+ AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..fedfa3f9dd900 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
assert(Ty.isScalar());
unsigned Size = Ty.getSizeInBits();
+ if (ST.hasVectorMulU64() && Size == 64)
+ return true;
+
unsigned NumParts = Size / 32;
assert((Size % 32) == 0);
assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f1caf2478e630..9b05f7c339738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Special case for s_mul_u64. There is not a vector equivalent of
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
// multiplications.
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+ DstTy.getSizeInBits() == 64) {
applyMappingSMULU64(B, OpdMapper);
return;
}
@@ -3973,7 +3974,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
- OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ else
+ OpdsMapping[0] =
+ getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8b758b011f6ad..5eddde1f72ec7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -267,6 +267,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasMinimum3Maximum3F16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
+ bool HasAddSubU64Insts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -1500,6 +1501,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 441034b508c10..92a56a1d5f492 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,7 +874,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasScalarSMulU64())
+ if (Subtarget->hasVectorMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget->hasScalarSMulU64())
setOperationAction(ISD::MUL, MVT::i64, Custom);
if (Subtarget->hasMad64_32())
@@ -5421,6 +5423,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
+ if (ST.hasAddSubU64Insts()) {
+ auto I = BuildMI(*BB, MI, DL,
+ TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+ : AMDGPU::V_SUB_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp
+ TII->legalizeOperands(*I);
+ MI.eraseFromParent();
+ return BB;
+ }
+
if (IsAdd && ST.hasLshlAddU64Inst()) {
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
Dest.getReg())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 571f3efd68260..40e687178fb01 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7361,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MUL_U64:
+ if (ST.hasVectorMulU64()) {
+ NewOpcode = AMDGPU::V_MUL_U64_e64;
+ break;
+ }
// Split s_mul_u64 in 32-bit vector multiplications.
splitScalarSMulU64(Worklist, Inst, MDT);
Inst.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b8537513ce986..485ca78db93a7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2914,6 +2914,7 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1e978c1..550ec9d3f55ab 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
}
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+ VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index f7f7e9645fa62..0d571d0e563b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -6,6 +6,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GCN-LABEL: s_mul_i16:
@@ -22,6 +23,11 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -74,6 +80,13 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -109,6 +122,13 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_zeroext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -165,6 +185,15 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_zeroext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -188,6 +217,13 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_signext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -248,6 +284,15 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_signext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -267,6 +312,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
@@ -293,6 +343,13 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
@@ -315,6 +372,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
; GFX12-NEXT: s_mul_i32 s0, s0, s2
; GFX12-NEXT: s_mul_i32 s1, s1, s3
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s2
+; GFX1250-NEXT: s_mul_i32 s1, s1, s3
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -344,6 +407,14 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -400,6 +471,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i33:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
ret i33 %result
}
@@ -456,6 +532,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
@@ -504,6 +585,13 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
@@ -620,6 +708,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0
; GFX12-NEXT: s_mov_b32 s0, s5
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i96:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s6, s0, s5
+; GFX1250-NEXT: s_mul_i32 s7, s1, s4
+; GFX1250-NEXT: s_mul_i32 s2, s2, s3
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1250-NEXT: s_mul_hi_u32 s7, s0, s3
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s2
+; GFX1250-NEXT: s_mul_i32 s2, s0, s4
+; GFX1250-NEXT: s_mul_i32 s5, s0, s3
+; GFX1250-NEXT: s_mul_hi_u32 s0, s0, s4
+; GFX1250-NEXT: s_add_co_u32 s2, s2, s7
+; GFX1250-NEXT: s_mul_i32 s4, s1, s3
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s0, s6
+; GFX1250-NEXT: s_mul_hi_u32 s3, s1, s3
+; GFX1250-NEXT: s_add_co_u32 s1, s4, s2
+; GFX1250-NEXT: s_add_co_ci_u32 s2, s3, s0
+; GFX1250-NEXT: s_mov_b32 s0, s5
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
@@ -686,6 +794,25 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i96:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
+; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
@@ -895,6 +1022,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX12-NEXT: s_mov_b32 s1, s8
; GFX12-NEXT: s_mov_b32 s2, s7
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s9, s0, s6
+; GFX1250-NEXT: s_mul_i32 s11, s1, s5
+; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6
+; GFX1250-NEXT: s_mul_hi_u32 s12, s1, s5
+; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT: s_mul_i32 s11, s2, s4
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT: s_mul_hi_u32 s12, s2, s4
+; GFX1250-NEXT: s_mul_hi_u32 s8, s0, s4
+; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT: s_mul_i32 s11, s0, s5
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT: s_mul_hi_u32 s12, s0, s5
+; GFX1250-NEXT: s_add_co_u32 s8, s11, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s9, s12, s9
+; GFX1250-NEXT: s_mul_i32 s12, s1, s4
+; GFX1250-NEXT: s_mul_hi_u32 s13, s1, s4
+; GFX1250-NEXT: s_cselect_b32 s11, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s8, s12, s8
+; GFX1250-NEXT: s_mul_i32 s12, s0, s7
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s13, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s9, s10, s12
+; GFX1250-NEXT: s_mul_i32 s1, s1, s6
+; GFX1250-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s5
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s9, s1
+; GFX1250-NEXT: s_mul_i32 s3, s3, s4
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s2
+; GFX1250-NEXT: s_mul_i32 s0, s0, s4
+; GFX1250-NEXT: s_add_co_i32 s3, s1, s3
+; GFX1250-NEXT: s_mov_b32 s1, s8
+; GFX1250-NEXT: s_mov_b32 s2, s7
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
@@ -1036,6 +1199,39 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v13, v10
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
+; GFX1250-NEXT: v_mov_b32_e32 v1, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v7
+; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
@@ -2020,6 +2216,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX12-NEXT: s_add_co_i32 s7, s1, s7
; GFX12-NEXT: s_mov_b32 s1, s16
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i256:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s17, s0, s10
+; GFX1250-NEXT: s_mul_i32 s19, s1, s9
+; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10
+; GFX1250-NEXT: s_mul_hi_u32 s20, s1, s9
+; GFX1250-NEXT: s_add_co_u32 s17, s19, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s20, s18
+; GFX1250-NEXT: s_mul_i32 s20, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s21, s2, s8
+; GFX1250-NEXT: s_cselect_b32 s19, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s17, s20, s17
+; GFX1250-NEXT: s_mul_hi_u32 s16, s0, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT: s_mul_i32 s21, s0, s9
+; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s9
+; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s16, s21, s16
+; GFX1250-NEXT: s_add_co_ci_u32 s17, s22, s17
+; GFX1250-NEXT: s_mul_i32 s22, s1, s8
+; GFX1250-NEXT: s_mul_hi_u32 s23, s1, s8
+; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s16, s22, s16
+; GFX1250-NEXT: s_add_co_ci_u32 s17, s23, s17
+; GFX1250-NEXT: s_mul_i32 s23, s0, s12
+; GFX1250-NEXT: s_mul_i32 s25, s1, s11
+; GFX1250-NEXT: s_mul_hi_u32 s24, s0, s12
+; GFX1250-NEXT: s_mul_hi_u32 s26, s1, s11
+; GFX1250-NEXT: s_cselect_b32 s22, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s25, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s26, s24
+; GFX1250-NEXT: s_mul_i32 s26, s2, s10
+; GFX1250-NEXT: s_mul_hi_u32 s27, s2, s10
+; GFX1250-NEXT: s_cselect_b32 s25, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s26, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s27, s24
+; GFX1250-NEXT: s_mul_i32 s27, s3, s9
+; GFX1250-NEXT: s_mul_hi_u32 s28, s3, s9
+; GFX1250-NEXT: s_cselect_b32 s26, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s27, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s28, s24
+; GFX1250-NEXT: s_mul_i32 s28, s4, s8
+; GFX1250-NEXT: s_mul_hi_u32 s29, s4, s8
+; GFX1250-NEXT: s_cselect_b32 s27, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s28, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s29, s24
+; GFX1250-NEXT: s_mul_i32 s29, s0, s11
+; GFX1250-NEXT: s_mul_hi_u32 s30, s0, s11
+; GFX1250-NEXT: s_cselect_b32 s28, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s29, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s30, s23
+; GFX1250-NEXT: s_mul_i32 s30, s1, s10
+; GFX1250-NEXT: s_mul_hi_u32 s31, s1, s10
+; GFX1250-NEXT: s_cselect_b32 s29, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s30, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s31, s23
+; GFX1250-NEXT: s_mul_i32 s31, s2, s9
+; GFX1250-NEXT: s_mul_hi_u32 s33, s2, s9
+; GFX1250-NEXT: s_cselect_b32 s30, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s31, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s33, s23
+; GFX1250-NEXT: s_mul_i32 s33, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s8
+; GFX1250-NEXT: s_cselect_b32 s31, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s33, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s34, s23
+; GFX1250-NEXT: s_cselect_b32 s33, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s22, 0
+; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s21, 0
+; GFX1250-NEXT: s_mul_i32 s21, s0, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, s23
+; GFX1250-NEXT: s_mul_i32 s23, s1, s13
+; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s2, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s2, s12
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s3, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s11
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s4, s10
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s4, s10
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s5, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s5, s9
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s6, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s6, s8
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s0, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s0, s13
+; GFX1250-NEXT: s_add_co_u32 s23, s23, s24
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s34, s21
+; GFX1250-NEXT: s_mul_i32 s34, s1, s12
+; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12
+; GFX1250-NEXT: s_cselect_b32 s24, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s34, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s35, s21
+; GFX1250-NEXT: s_mul_i32 s35, s2, s11
+; GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11
+; GFX1250-NEXT: s_cselect_b32 s34, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s35, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s36, s21
+; GFX1250-NEXT: s_mul_i32 s36, s3, s10
+; GFX1250-NEXT: s_mul_hi_u32 s37, s3, s10
+; GFX1250-NEXT: s_cselect_b32 s35, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s36, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s37, s21
+; GFX1250-NEXT: s_mul_i32 s37, s4, s9
+; GFX1250-NEXT: s_mul_hi_u32 s38, s4, s9
+; GFX1250-NEXT: s_cselect_b32 s36, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s37, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s38, s21
+; GFX1250-NEXT: s_mul_i32 s38, s5, s8
+; GFX1250-NEXT: s_mul_hi_u32 s39, s5, s8
+; GFX1250-NEXT: s_cselect_b32 s37, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s38, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s39, s21
+; GFX1250-NEXT: s_cselect_b32 s38, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s30, 0
+; GFX1250-NEXT: s_mul_i32 s1, s1, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s31, 0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s33, 0
+; GFX1250-NEXT: s_mul_i32 s3, s3, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT: s_mul_i32 s4, s4, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s20, s29, s23
+; GFX1250-NEXT: s_cselect_b32 s23, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s26, 0
+; GFX1250-NEXT: s_mul_i32 s26, s0, s15
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s27, 0
+; GFX1250-NEXT: s_mul_i32 s5, s5, s10
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s28, 0
+; GFX1250-NEXT: s_mul_i32 s6, s6, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s23, 0
+; GFX1250-NEXT: s_mul_i32 s7, s7, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s15, s25, s21
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s22, s26
+; GFX1250-NEXT: s_cmp_lg_u32 s38, 0
+; GFX1250-NEXT: s_mul_i32 s0, s0, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s21, s1
+; GFX1250-NEXT: s_cmp_lg_u32 s37, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s2
+; GFX1250-NEXT: s_cmp_lg_u32 s36, 0
+; GFX1250-NEXT: s_mov_b32 s2, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GFX1250-NEXT: s_cmp_lg_u32 s35, 0
+; GFX1250-NEXT: s_mov_b32 s3, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s4
+; GFX1250-NEXT: s_cmp_lg_u32 s34, 0
+; GFX1250-NEXT: s_mov_b32 s4, s19
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s5
+; GFX1250-NEXT: s_cmp_lg_u32 s24, 0
+; GFX1250-NEXT: s_mov_b32 s5, s20
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s6
+; GFX1250-NEXT: s_mov_b32 s6, s15
+; GFX1250-NEXT: s_add_co_i32 s7, s1, s7
+; GFX1250-NEXT: s_mov_b32 s1, s16
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
@@ -2478,6 +2853,107 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i256:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0
+; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
+; GFX1250-NEXT: v_mov_b32_e32 v20, v19
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v21, v22
+; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
+; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
+; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
+; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
+; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}
@@ -2536,6 +3012,14 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2632,6 +3116,21 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2704,6 +3203,14 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2815,6 +3322,20 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 80
diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll
new file mode 100644
index 0000000000000..0373027201378
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b ; both operands are non-inreg; gfx1200 expects an add/add-with-carry pair, gfx1250 a single v_add_nc_u64
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_add_u64_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b ; %b is inreg; the gfx1250 expectation places the s[0:1] pair in the first source slot
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_sv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b ; mirror of test_add_u64_vs with %a inreg instead; the gfx1250 expectation is the same instruction
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_add_u64_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b ; both operands inreg; one shared expectation (s_add_nc_u64) covers both run lines
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats (expected output copies s0/s1 to v0/v1)
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_inline_lit:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_inline_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 5 ; small constant; the gfx1250 expectation carries the 5 directly as the first source operand
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_small_imm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_small_imm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 500 ; 500 == 0x1f4, which is the literal the expected instructions carry
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_64bit_imm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_64bit_imm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 5294967295 ; 5294967295 == 0x13b9ac9ff (wider than 32 bits); gfx1200 expects it split into 0x3b9ac9ff / 1, gfx1250 expects one lit64 operand
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) {
+; GCN-LABEL: test_add_u64_s_small_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 500 ; inreg operand plus 500 (0x1f4); one shared expectation (s_add_nc_u64) covers both run lines
+ %ret = bitcast i64 %add to <2 x float> ; return the 64-bit sum reinterpreted as two floats (expected output copies s0/s1 to v0/v1)
+ ret <2 x float> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 95504052249e0..7fec5f71ce8d5 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -152,7 +152,7 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
; GCN-NEXT: s_cbranch_execnz .LBB3_1
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index eff68ce2de11d..4a634520c682e 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -9,6 +9,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s
; GCN-ISEL-LABEL: name: sadd64rr
; GCN-ISEL-LABEL: body:
@@ -113,6 +114,19 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: sadd64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, ptr addrspace(1) %out
@@ -211,6 +225,17 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: sadd64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], lit64(0x123456789876)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%add = add i64 20015998343286, %a
store i64 %add, ptr addrspace(1) %out
@@ -301,6 +326,17 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vadd64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -391,6 +427,17 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vadd64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,6 +533,18 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: suaddo32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -606,6 +665,21 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: uaddo32_vcc_user:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -741,6 +815,22 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: suaddo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -874,6 +964,23 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vuaddo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -987,6 +1094,19 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ssub64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, ptr addrspace(1) %out
@@ -1085,6 +1205,17 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ssub64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[2:3], lit64(0x123456789876), s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%sub = sub i64 20015998343286, %a
store i64 %sub, ptr addrspace(1) %out
@@ -1175,6 +1306,17 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vsub64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1265,6 +1407,17 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vsub64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1361,6 +1514,18 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: susubo32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1481,6 +1646,21 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: usubo32_vcc_user:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1616,6 +1796,22 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: susubo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -1749,6 +1945,23 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vusubo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -2904,6 +3117,191 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: .LBB16_4:
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_branch .LBB16_2
+;
+; GFX1250-LABEL: sudiv64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xffffffff00000000)
+; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
+; GFX1250-NEXT: ; %bb.1:
+; GFX1250-NEXT: s_cvt_f32_u32 s0, s2
+; GFX1250-NEXT: s_cvt_f32_u32 s1, s3
+; GFX1250-NEXT: s_sub_nc_u64 s[6:7], 0, s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000
+; GFX1250-NEXT: v_s_rcp_f32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT: s_trunc_f32 s1, s1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000
+; GFX1250-NEXT: s_cvt_u32_f32 s5, s1
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_cvt_u32_f32 s4, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5]
+; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13
+; GFX1250-NEXT: s_mul_i32 s14, s4, s13
+; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12
+; GFX1250-NEXT: s_mul_i32 s17, s5, s12
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15]
+; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12
+; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13
+; GFX1250-NEXT: s_add_co_u32 s0, s14, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16
+; GFX1250-NEXT: s_mul_i32 s12, s5, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7
+; GFX1250-NEXT: s_mul_i32 s12, s4, s7
+; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX1250-NEXT: s_mul_i32 s15, s5, s6
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6
+; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_co_u32 s0, s12, s15
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14
+; GFX1250-NEXT: s_mul_i32 s6, s5, s7
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0
+; GFX1250-NEXT: s_mul_i32 s4, s10, s0
+; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0
+; GFX1250-NEXT: s_mul_i32 s6, s11, s0
+; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7
+; GFX1250-NEXT: s_mul_i32 s13, s11, s7
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5]
+; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_co_u32 s4, s4, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7]
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_or_b32 s6, s6, s4
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4
+; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3
+; GFX1250-NEXT: s_cmp_lg_u32 s12, 0
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1
+; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_cmp_ge_u32 s4, s3
+; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s4, s3
+; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo
+; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
+; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_cmp_ge_u32 s0, s3
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, s3
+; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX1250-NEXT: s_wait_alu 0xfffd
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo
+; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
+; GFX1250-NEXT: .LBB16_2:
+; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1250-NEXT: s_sub_co_i32 s1, 0, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX1250-NEXT: v_nop
+; GFX1250-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_i32 s1, s1, s0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_mul_i32 s1, s0, s2
+; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2
+; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_cselect_b32 s1, s4, s1
+; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: .LBB16_3:
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
+; GFX1250-NEXT: s_endpgm
+; GFX1250-NEXT: .LBB16_4:
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_branch .LBB16_2
%result = udiv i64 %x, %y
store i64 %result, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index dea9142cf2bee..f9fae025e0bf8 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -737,7 +737,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 0x7b ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0xfd,0x03,0x7b,0x00,0x00,0x00]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x7b, v[0:1] ; encoding: [0xff,0x00,0x00,0x50,0x7b,0x00,0x00,0x00]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%add = add i64 %x, 123
ret i64 %add
@@ -747,7 +747,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX10: codeLenInByte = 28
; GFX1100: codeLenInByte = 32
; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 24
+; GFX1250: codeLenInByte = 20
define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX9-LABEL: v_add_u64_vop2_literal_64:
@@ -788,9 +788,7 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: s_mov_b64 s[0:1], lit64(0x112345678) ; encoding: [0xfe,0x01,0x80,0xbe,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
-; GFX1250-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
-; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0x01,0x00]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x112345678), v[0:1] ; encoding: [0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%add = add i64 %x, 4600387192
ret i64 %add
@@ -800,6 +798,6 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX10: codeLenInByte = 28
; GFX1100: codeLenInByte = 32
; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 36
+; GFX1250: codeLenInByte = 24
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; NOT-GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index f4040f3049e0d..eba46a1ecb614 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -256,7 +256,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -350,8 +350,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -455,7 +455,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
@@ -529,8 +529,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -676,7 +676,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -704,7 +704,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -751,7 +751,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
@@ -772,8 +772,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -805,7 +805,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -856,7 +856,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
@@ -879,7 +879,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3
@@ -904,7 +904,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -943,7 +943,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -959,8 +959,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -989,7 +989,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1032,7 +1032,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1112,7 +1112,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -1131,7 +1131,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
@@ -1140,9 +1140,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -1179,7 +1177,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
@@ -1189,9 +1187,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
@@ -1212,8 +1208,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1236,7 +1232,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
@@ -1245,9 +1241,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -1288,7 +1282,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
@@ -1298,9 +1292,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
@@ -1323,7 +1315,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3
@@ -1338,7 +1330,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
@@ -1348,9 +1340,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1378,7 +1368,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
@@ -1389,9 +1379,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1407,8 +1395,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -1427,7 +1415,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
@@ -1437,9 +1425,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1471,7 +1457,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
@@ -1482,9 +1468,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1564,7 +1548,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -1662,8 +1646,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1771,7 +1755,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3
@@ -1853,8 +1837,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -2008,7 +1992,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2106,8 +2090,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2215,7 +2199,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3
@@ -2297,8 +2281,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -2452,7 +2436,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2550,8 +2534,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2659,7 +2643,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3
@@ -2741,8 +2725,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -2890,7 +2874,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2992,8 +2976,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3105,7 +3089,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3
@@ -3187,8 +3171,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -3336,7 +3320,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -3438,8 +3422,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3551,7 +3535,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3
@@ -3633,8 +3617,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -3782,7 +3766,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -3884,8 +3868,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3997,7 +3981,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3
@@ -4079,8 +4063,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -4228,7 +4212,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -4330,8 +4314,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -4443,7 +4427,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3
@@ -4525,8 +4509,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -4695,7 +4679,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -4802,8 +4786,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -4920,7 +4904,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3
@@ -5010,8 +4994,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -5164,7 +5148,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -5192,10 +5176,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -5243,10 +5227,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5269,8 +5253,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5302,10 +5286,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -5357,10 +5341,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5385,7 +5369,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3
@@ -5408,10 +5392,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5449,10 +5433,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5470,8 +5454,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -5498,10 +5482,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5543,10 +5527,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5621,7 +5605,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -5651,7 +5635,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5703,7 +5687,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5727,8 +5711,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5762,7 +5746,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5818,7 +5802,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5844,7 +5828,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3
@@ -5869,7 +5853,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -5913,7 +5897,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
@@ -5934,8 +5918,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -5964,7 +5948,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
@@ -6012,7 +5996,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index e6018e413a85d..3f1e354f2ccc7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -341,7 +341,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -673,7 +673,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: ; return to shader part epilog
@@ -703,7 +703,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: ; return to shader part epilog
@@ -2140,7 +2140,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
@@ -2198,7 +2198,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 79907fd0c60bc..fd644a35f61e3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -304,78 +304,79 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112
-; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96
-; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:32
-; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:16
-; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
+; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[16:17], 0x70
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT: v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
+; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[16:17], v[6:9], off
+; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[10:13], off
+; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21
+; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[30:33], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[22:25], off
+; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[26:29], off
+; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_wait_xcnt 0x3
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-SDAG-NEXT: s_wait_xcnt 0x2
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off
; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:16
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
; GCN-SDAG-NEXT: s_clause 0x3
; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32
; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4
; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8
; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12
; GCN-SDAG-NEXT: s_wait_xcnt 0xc
-; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29
+; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -403,11 +404,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
@@ -422,28 +423,28 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off
; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[16:17], v[16:17], 0, v[16:17]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
; GCN-GISEL-NEXT: s_wait_xcnt 0x3
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[20:21], 0x64, v[20:21]
; GCN-GISEL-NEXT: s_wait_xcnt 0x2
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25]
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
; GCN-GISEL-NEXT: s_wait_xcnt 0x1
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
; GCN-GISEL-NEXT: s_wait_xcnt 0x0
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index df4ff2c8d9851..6706e7638580d 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -12,21 +12,11 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) {
}
define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_u64:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678)
-; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT: s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_u64:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xf12345678)
-; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT: s_endpgm
+; GCN-LABEL: v_add_u64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
%result = add i64 %a, 64729929336
store i64 %result, ptr addrspace(1) %out, align 8
ret void
@@ -42,21 +32,11 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) {
}
define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_neg_u64:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xfffffff0edcba988)
-; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT: s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_neg_u64:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xfffffff0edcba988)
-; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT: s_endpgm
+; GCN-LABEL: v_add_neg_u64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xfffffff0edcba988), v[0:1]
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
%result = sub i64 %a, 64729929336
store i64 %result, ptr addrspace(1) %out, align 8
ret void
@@ -74,9 +54,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) {
define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) {
; GCN-LABEL: v_sub_u64:
; GCN: ; %bb.0:
-; GCN-NEXT: v_sub_co_u32 v0, vcc_lo, 0x12345678, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo
+; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
; GCN-NEXT: s_endpgm
%result = sub i64 64729929336, %a
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 91b3a85d36114..8d3716ef62f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -5,6 +5,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
; mul24 and mad24 are affected
@@ -124,6 +125,25 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: test_mul_v2i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: test_mul_v2i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -286,6 +306,29 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_v4i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v7
+; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v6
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v5
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v4
+; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_v4i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -402,6 +445,19 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_trunc_i64_mul_to_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_i32 s2, s3, s2
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -555,6 +611,29 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_trunc_i64_mul_to_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s6, s10
+; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -670,6 +749,19 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_sext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -773,6 +865,18 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_zext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_zext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -909,6 +1013,26 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_sext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1052,6 +1176,25 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_zext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_zext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1192,6 +1335,26 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_sext_inline_imm:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 9, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_inline_imm:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1300,6 +1463,20 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1425,6 +1602,24 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1540,6 +1735,22 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
@@ -1699,6 +1910,28 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: buffer_load_u8 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -1856,6 +2089,19 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
@@ -2044,6 +2290,29 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s6, s10
+; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null
+; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -2286,6 +2555,41 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul32_in_branch:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s6, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB15_2
+; GFX1250-NEXT: ; %bb.1: ; %else
+; GFX1250-NEXT: s_mul_i32 s7, s0, s1
+; GFX1250-NEXT: s_branch .LBB15_3
+; GFX1250-NEXT: .LBB15_2:
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: ; implicit-def: $sgpr7
+; GFX1250-NEXT: .LBB15_3: ; %Flow
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX1250-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX1250-NEXT: ; %bb.4: ; %if
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, s2
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_branch .LBB15_6
+; GFX1250-NEXT: .LBB15_5:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s7
+; GFX1250-NEXT: .LBB15_6: ; %endif
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul32_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
@@ -2539,6 +2843,34 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_in_branch:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3
+; GFX1250-NEXT: ; %bb.1: ; %else
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX1250-NEXT: s_cbranch_execnz .LBB16_4
+; GFX1250-NEXT: .LBB16_2: ; %if
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s4, s2
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_branch .LBB16_5
+; GFX1250-NEXT: .LBB16_3:
+; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX1250-NEXT: s_branch .LBB16_2
+; GFX1250-NEXT: .LBB16_4:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: .LBB16_5: ; %endif
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
@@ -2882,6 +3214,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c
+; GFX1250-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffff)
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s17, s3
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s20, s3
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s2, s8
+; GFX1250-NEXT: s_and_b64 s[4:5], s[12:13], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s6, s13
+; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[4:5], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s16, s9
+; GFX1250-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[6:7], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s2, s13
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17]
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[14:15], s[2:3]
+; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX1250-NEXT: s_mov_b32 s2, s15
+; GFX1250-NEXT: s_mov_b32 s15, s3
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[14:15]
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9]
+; GFX1250-NEXT: s_mov_b32 s18, s5
+; GFX1250-NEXT: s_mov_b32 s21, s4
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX1250-NEXT: s_or_b64 s[4:5], s[12:13], s[20:21]
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[6:7], s[2:3]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
@@ -3159,6 +3537,43 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX1250-NEXT: v_and_b32_e32 v16, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[0:3], v16, s[2:3] scale_offset
+; GFX1250-NEXT: global_load_b128 v[4:7], v16, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT: v_mul_u64_e32 v[6:7], v[0:1], v[6:7]
+; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v4
+; GFX1250-NEXT: v_mul_u64_e32 v[8:9], v[8:9], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v5
+; GFX1250-NEXT: v_mov_b32_e32 v10, v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11]
+; GFX1250-NEXT: v_add3_u32 v7, v3, v7, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12
+; GFX1250-NEXT: v_mov_b32_e32 v14, v13
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7]
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[2:3] scale_offset
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -3271,6 +3686,13 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-LABEL: mul_pow2_plus_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 64392a15e9a9b..192dce369b0ef 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -369,7 +369,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; SDAG-NEXT: s_wait_loadcnt 0x0
-; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; SDAG-NEXT: s_wait_xcnt 0x0
; SDAG-NEXT: s_wait_alu 0xfffe
@@ -418,7 +418,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
new file mode 100644
index 0000000000000..baaca4ddeaf05
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_sub_u64_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_sv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_sub_u64_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_inline_lit_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 5, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_inline_lit_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 5, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_sub_u64_v_inline_lit:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_v_inline_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], -5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, 5
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_small_imm_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_small_imm_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 500, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_64bit_imm_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_64bit_imm_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 5294967295, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) {
+; GCN-LABEL: test_sub_u64_small_imm_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %sub = sub i64 500, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
index 20bc578605b8c..0a1d3bfc02503 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
@@ -154,6 +154,362 @@ v_fmac_f64 v[4:5], v[2:3], v[8:9] div:2
// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_add_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], exec
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -1
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3]
// GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
index f67ad88b5ae83..9f5036106dbd3 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
@@ -22,3 +22,8 @@ v_fmamk_f16 v4, v2, 3, v6 row_share:1
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1250-ERR-NEXT:{{^}}v_fmamk_f16 v4, v2, 3, v6 row_share:1
// GFX1250-ERR-NEXT:{{^}} ^
+
+v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
index c1213f2d9ec0d..130941c8c1397 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
@@ -112,6 +112,264 @@
0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00
# GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00]
+0x02,0x09,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+
+0x02,0x11,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+
+0xc1,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50]
+
+0xc1,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50]
+
+0xf7,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50]
+
+0xf7,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50]
+
+0x80,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50]
+
+0x80,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50]
+
+0xf0,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50]
+
+0xf0,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50]
+
+0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x50,0x56,0x34,0x12,0xaf
+# GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50]
+
+0x7e,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50]
+
+0xfe,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+
+0xfe,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+
+0x02,0xfd,0x09,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+
+0x02,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+
+0x02,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+
+0x6a,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50]
+
+0x6a,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50]
+
+0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+
+0x02,0x11,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+
+0xc1,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52]
+
+0xc1,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52]
+
+0xf7,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52]
+
+0xf7,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52]
+
+0x80,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52]
+
+0x80,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52]
+
+0xf0,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52]
+
+0xf0,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52]
+
+0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x52,0x56,0x34,0x12,0xaf
+# GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52]
+
+0x7e,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52]
+
+0xfe,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+
+0xfe,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+
+0x02,0xfd,0x09,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+
+0x02,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+
+0x02,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+
+0x6a,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52]
+
+0x6a,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52]
+
+0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+
+0x02,0x11,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+
+0xc1,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54]
+
+0xc1,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54]
+
+0xf7,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54]
+
+0xf7,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54]
+
+0x80,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54]
+
+0x80,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54]
+
+0xf0,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54]
+
+0xf0,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54]
+
+0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f
+# GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x54,0x56,0x34,0x12,0xaf
+# GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54]
+
+0x7e,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54]
+
+0xfe,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+
+0xfe,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+
+0x02,0xfd,0x09,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+
+0x02,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54]
+
+0x02,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54]
+
+0x6a,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54]
+
+0x6a,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54]
+
+0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+
0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40
# GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
More information about the llvm-commits
mailing list