[llvm] [AMDGPU] Add V_ADD|SUB|MUL_U64 gfx1250 opcodes (PR #150291)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 23 12:20:45 PDT 2025


https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/150291

None

>From 19ceda7b605f906190e4f64ab224f00876c5ea78 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 23 Jul 2025 11:16:26 -0700
Subject: [PATCH] [AMDGPU] Add V_ADD|SUB|MUL_U64 gfx1250 opcodes

---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   8 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   7 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  17 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |   1 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |  17 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    | 521 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/add_u64.ll           | 129 +++++
 .../AMDGPU/branch-relaxation-gfx1250.ll       |   2 +-
 .../test/CodeGen/AMDGPU/carryout-selection.ll | 398 +++++++++++++
 .../test/CodeGen/AMDGPU/code-size-estimate.ll |  10 +-
 .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 268 +++++----
 llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll   |  10 +-
 llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll  | 117 ++--
 llvm/test/CodeGen/AMDGPU/literal64.ll         |  44 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               | 422 ++++++++++++++
 llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/sub_u64.ll           | 146 +++++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s        | 356 ++++++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s    |   5 +
 .../Disassembler/AMDGPU/gfx1250_dasm_vop2.txt | 258 +++++++++
 23 files changed, 2506 insertions(+), 250 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/add_u64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sub_u64.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1c7ee724fef09..ff2595ef51869 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1355,6 +1355,10 @@ def FeatureLshlAddU64Inst
     : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
                        "Has v_lshl_add_u64 instruction">;
 
+def FeatureAddSubU64Insts
+    : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+                       "Has v_add_u64 and v_sub_u64 instructions">;
+
 def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
   "HasVMemToLDSLoad",
   "true",
@@ -2010,6 +2014,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureMemoryAtomicFAddF32DenormalSupport,
    FeatureKernargPreload,
    FeatureLshlAddU64Inst,
+   FeatureAddSubU64Insts,
    FeatureLdsBarrierArriveAtomic,
    FeatureSetPrioIncWgInst,
 ]>;
@@ -2787,6 +2792,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
 def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
                         AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
 
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+                        AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
 def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
   AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..fedfa3f9dd900 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
   assert(Ty.isScalar());
 
   unsigned Size = Ty.getSizeInBits();
+  if (ST.hasVectorMulU64() && Size == 64)
+    return true;
+
   unsigned NumParts = Size / 32;
   assert((Size % 32) == 0);
   assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f1caf2478e630..9b05f7c339738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     // Special case for s_mul_u64. There is not a vector equivalent of
     // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
     // multiplications.
-    if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+    if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+        DstTy.getSizeInBits() == 64) {
       applyMappingSMULU64(B, OpdMapper);
       return;
     }
@@ -3973,7 +3974,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
       } else {
-        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+        if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+          OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+        else
+          OpdsMapping[0] =
+              getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8b758b011f6ad..5eddde1f72ec7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -267,6 +267,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasMinimum3Maximum3F16 = false;
   bool HasMinimum3Maximum3PKF16 = false;
   bool HasLshlAddU64Inst = false;
+  bool HasAddSubU64Insts = false;
   bool HasPointSampleAccel = false;
   bool HasLdsBarrierArriveAtomic = false;
   bool HasSetPrioIncWgInst = false;
@@ -1500,6 +1501,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasVOPD3() const { return GFX1250Insts; }
 
+  // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+  bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+  // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+  bool hasVectorMulU64() const { return GFX1250Insts; }
+
   // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
   bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 441034b508c10..92a56a1d5f492 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,7 +874,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
 
-  if (Subtarget->hasScalarSMulU64())
+  if (Subtarget->hasVectorMulU64())
+    setOperationAction(ISD::MUL, MVT::i64, Legal);
+  else if (Subtarget->hasScalarSMulU64())
     setOperationAction(ISD::MUL, MVT::i64, Custom);
 
   if (Subtarget->hasMad64_32())
@@ -5421,6 +5423,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MachineOperand &Src0 = MI.getOperand(1);
     MachineOperand &Src1 = MI.getOperand(2);
 
+    if (ST.hasAddSubU64Insts()) {
+      auto I = BuildMI(*BB, MI, DL,
+                       TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+                                      : AMDGPU::V_SUB_U64_e64),
+                       Dest.getReg())
+                   .add(Src0)
+                   .add(Src1)
+                   .addImm(0); // clamp
+      TII->legalizeOperands(*I);
+      MI.eraseFromParent();
+      return BB;
+    }
+
     if (IsAdd && ST.hasLshlAddU64Inst()) {
       auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
                          Dest.getReg())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 571f3efd68260..40e687178fb01 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7361,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   }
 
   case AMDGPU::S_MUL_U64:
+    if (ST.hasVectorMulU64()) {
+      NewOpcode = AMDGPU::V_MUL_U64_e64;
+      break;
+    }
     // Split s_mul_u64 in 32-bit vector multiplications.
     splitScalarSMulU64(Worklist, Inst, MDT);
     Inst.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b8537513ce986..485ca78db93a7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2914,6 +2914,7 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
 def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
 def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
 def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
 def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1e978c1..550ec9d3f55ab 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
   defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
 }
 
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// There is no V_SUBREV_U64 instruction, so V_SUB_U64 cannot be commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
 } // End isCommutable = 1
 
 // These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
   VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
   VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
 
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+  VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
 multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
                                       string asmName> {
   defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
 
 defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
 defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
 
 //===----------------------------------------------------------------------===//
 // GFX11.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index f7f7e9645fa62..0d571d0e563b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -6,6 +6,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
 
 define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
 ; GCN-LABEL: s_mul_i16:
@@ -22,6 +23,11 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -74,6 +80,13 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -109,6 +122,13 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_zeroext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -165,6 +185,15 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_zeroext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -188,6 +217,13 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_signext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_sext_i32_i16 s0, s0
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -248,6 +284,15 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_signext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
@@ -267,6 +312,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i32 %num, %den
   ret i32 %result
 }
@@ -293,6 +343,13 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i32 %num, %den
   ret i32 %result
 }
@@ -315,6 +372,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX12-NEXT:    s_mul_i32 s1, s1, s3
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_v2i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1250-NEXT:    s_mul_i32 s1, s1, s3
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul <2 x i32> %num, %den
   ret <2 x i32> %result
 }
@@ -344,6 +407,14 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_v2i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul <2 x i32> %num, %den
   ret <2 x i32> %result
 }
@@ -400,6 +471,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num,  i33 inreg %den) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i33:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i33 %num, %den
   ret i33 %result
 }
@@ -456,6 +532,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i64 %num, %den
   ret i64 %result
 }
@@ -504,6 +585,13 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i64 %num, %den
   ret i64 %result
 }
@@ -620,6 +708,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX12-NEXT:    s_add_co_ci_u32 s2, s3, s0
 ; GFX12-NEXT:    s_mov_b32 s0, s5
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i96:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s6, s0, s5
+; GFX1250-NEXT:    s_mul_i32 s7, s1, s4
+; GFX1250-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1250-NEXT:    s_add_co_i32 s6, s6, s7
+; GFX1250-NEXT:    s_mul_hi_u32 s7, s0, s3
+; GFX1250-NEXT:    s_add_co_i32 s6, s6, s2
+; GFX1250-NEXT:    s_mul_i32 s2, s0, s4
+; GFX1250-NEXT:    s_mul_i32 s5, s0, s3
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s0, s4
+; GFX1250-NEXT:    s_add_co_u32 s2, s2, s7
+; GFX1250-NEXT:    s_mul_i32 s4, s1, s3
+; GFX1250-NEXT:    s_add_co_ci_u32 s0, s0, s6
+; GFX1250-NEXT:    s_mul_hi_u32 s3, s1, s3
+; GFX1250-NEXT:    s_add_co_u32 s1, s4, s2
+; GFX1250-NEXT:    s_add_co_ci_u32 s2, s3, s0
+; GFX1250-NEXT:    s_mov_b32 s0, s5
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i96 %num, %den
   %cast = bitcast i96 %result to <3 x i32>
   ret <3 x i32> %cast
@@ -686,6 +794,25 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i96:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v6, v5
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
+; GFX1250-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i96 %num, %den
   ret i96 %result
 }
@@ -895,6 +1022,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX12-NEXT:    s_mov_b32 s1, s8
 ; GFX12-NEXT:    s_mov_b32 s2, s7
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s9, s0, s6
+; GFX1250-NEXT:    s_mul_i32 s11, s1, s5
+; GFX1250-NEXT:    s_mul_hi_u32 s10, s0, s6
+; GFX1250-NEXT:    s_mul_hi_u32 s12, s1, s5
+; GFX1250-NEXT:    s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT:    s_mul_i32 s11, s2, s4
+; GFX1250-NEXT:    s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s12, s2, s4
+; GFX1250-NEXT:    s_mul_hi_u32 s8, s0, s4
+; GFX1250-NEXT:    s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT:    s_mul_i32 s11, s0, s5
+; GFX1250-NEXT:    s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s12, s0, s5
+; GFX1250-NEXT:    s_add_co_u32 s8, s11, s8
+; GFX1250-NEXT:    s_add_co_ci_u32 s9, s12, s9
+; GFX1250-NEXT:    s_mul_i32 s12, s1, s4
+; GFX1250-NEXT:    s_mul_hi_u32 s13, s1, s4
+; GFX1250-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s8, s12, s8
+; GFX1250-NEXT:    s_mul_i32 s12, s0, s7
+; GFX1250-NEXT:    s_add_co_ci_u32 s7, s13, s9
+; GFX1250-NEXT:    s_add_co_ci_u32 s9, s10, s12
+; GFX1250-NEXT:    s_mul_i32 s1, s1, s6
+; GFX1250-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX1250-NEXT:    s_mul_i32 s2, s2, s5
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s9, s1
+; GFX1250-NEXT:    s_mul_i32 s3, s3, s4
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s4
+; GFX1250-NEXT:    s_add_co_i32 s3, s1, s3
+; GFX1250-NEXT:    s_mov_b32 s1, s8
+; GFX1250-NEXT:    s_mov_b32 s2, s7
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i128 %num, %den
   %cast = bitcast i128 %result to <4 x i32>
   ret <4 x i32> %cast
@@ -1036,6 +1199,39 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11]
+; GFX1250-NEXT:    v_mov_b32_e32 v12, v1
+; GFX1250-NEXT:    v_mul_lo_u32 v1, v9, v6
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT:    v_mul_lo_u32 v8, v8, v7
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v6
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v7
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i128 %num, %den
   ret i128 %result
 }
@@ -2020,6 +2216,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_i32 s7, s1, s7
 ; GFX12-NEXT:    s_mov_b32 s1, s16
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i256:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mul_i32 s17, s0, s10
+; GFX1250-NEXT:    s_mul_i32 s19, s1, s9
+; GFX1250-NEXT:    s_mul_hi_u32 s18, s0, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s20, s1, s9
+; GFX1250-NEXT:    s_add_co_u32 s17, s19, s17
+; GFX1250-NEXT:    s_add_co_ci_u32 s18, s20, s18
+; GFX1250-NEXT:    s_mul_i32 s20, s2, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s21, s2, s8
+; GFX1250-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s17, s20, s17
+; GFX1250-NEXT:    s_mul_hi_u32 s16, s0, s8
+; GFX1250-NEXT:    s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT:    s_mul_i32 s21, s0, s9
+; GFX1250-NEXT:    s_mul_hi_u32 s22, s0, s9
+; GFX1250-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s16, s21, s16
+; GFX1250-NEXT:    s_add_co_ci_u32 s17, s22, s17
+; GFX1250-NEXT:    s_mul_i32 s22, s1, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s23, s1, s8
+; GFX1250-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s16, s22, s16
+; GFX1250-NEXT:    s_add_co_ci_u32 s17, s23, s17
+; GFX1250-NEXT:    s_mul_i32 s23, s0, s12
+; GFX1250-NEXT:    s_mul_i32 s25, s1, s11
+; GFX1250-NEXT:    s_mul_hi_u32 s24, s0, s12
+; GFX1250-NEXT:    s_mul_hi_u32 s26, s1, s11
+; GFX1250-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s25, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s24, s26, s24
+; GFX1250-NEXT:    s_mul_i32 s26, s2, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s27, s2, s10
+; GFX1250-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s26, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s24, s27, s24
+; GFX1250-NEXT:    s_mul_i32 s27, s3, s9
+; GFX1250-NEXT:    s_mul_hi_u32 s28, s3, s9
+; GFX1250-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s27, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s24, s28, s24
+; GFX1250-NEXT:    s_mul_i32 s28, s4, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s29, s4, s8
+; GFX1250-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s28, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s24, s29, s24
+; GFX1250-NEXT:    s_mul_i32 s29, s0, s11
+; GFX1250-NEXT:    s_mul_hi_u32 s30, s0, s11
+; GFX1250-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s18, s29, s18
+; GFX1250-NEXT:    s_add_co_ci_u32 s23, s30, s23
+; GFX1250-NEXT:    s_mul_i32 s30, s1, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s31, s1, s10
+; GFX1250-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s18, s30, s18
+; GFX1250-NEXT:    s_add_co_ci_u32 s23, s31, s23
+; GFX1250-NEXT:    s_mul_i32 s31, s2, s9
+; GFX1250-NEXT:    s_mul_hi_u32 s33, s2, s9
+; GFX1250-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s18, s31, s18
+; GFX1250-NEXT:    s_add_co_ci_u32 s23, s33, s23
+; GFX1250-NEXT:    s_mul_i32 s33, s3, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s3, s8
+; GFX1250-NEXT:    s_cselect_b32 s31, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s18, s33, s18
+; GFX1250-NEXT:    s_add_co_ci_u32 s23, s34, s23
+; GFX1250-NEXT:    s_cselect_b32 s33, 1, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX1250-NEXT:    s_mul_hi_u32 s22, s0, s14
+; GFX1250-NEXT:    s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX1250-NEXT:    s_add_co_ci_u32 s19, s19, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX1250-NEXT:    s_mul_i32 s21, s0, s14
+; GFX1250-NEXT:    s_add_co_ci_u32 s19, s19, s23
+; GFX1250-NEXT:    s_mul_i32 s23, s1, s13
+; GFX1250-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s2, s12
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s2, s12
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s3, s11
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s3, s11
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s4, s10
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s4, s10
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s5, s9
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s5, s9
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s6, s8
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s6, s8
+; GFX1250-NEXT:    s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT:    s_mul_i32 s23, s0, s13
+; GFX1250-NEXT:    s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT:    s_mul_hi_u32 s34, s0, s13
+; GFX1250-NEXT:    s_add_co_u32 s23, s23, s24
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s34, s21
+; GFX1250-NEXT:    s_mul_i32 s34, s1, s12
+; GFX1250-NEXT:    s_mul_hi_u32 s35, s1, s12
+; GFX1250-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s34, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s35, s21
+; GFX1250-NEXT:    s_mul_i32 s35, s2, s11
+; GFX1250-NEXT:    s_mul_hi_u32 s36, s2, s11
+; GFX1250-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s35, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s36, s21
+; GFX1250-NEXT:    s_mul_i32 s36, s3, s10
+; GFX1250-NEXT:    s_mul_hi_u32 s37, s3, s10
+; GFX1250-NEXT:    s_cselect_b32 s35, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s36, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s37, s21
+; GFX1250-NEXT:    s_mul_i32 s37, s4, s9
+; GFX1250-NEXT:    s_mul_hi_u32 s38, s4, s9
+; GFX1250-NEXT:    s_cselect_b32 s36, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s37, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s38, s21
+; GFX1250-NEXT:    s_mul_i32 s38, s5, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s39, s5, s8
+; GFX1250-NEXT:    s_cselect_b32 s37, 1, 0
+; GFX1250-NEXT:    s_add_co_u32 s23, s38, s23
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s39, s21
+; GFX1250-NEXT:    s_cselect_b32 s38, 1, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX1250-NEXT:    s_mul_i32 s1, s1, s14
+; GFX1250-NEXT:    s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s31, 0
+; GFX1250-NEXT:    s_mul_i32 s2, s2, s13
+; GFX1250-NEXT:    s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX1250-NEXT:    s_mul_i32 s3, s3, s12
+; GFX1250-NEXT:    s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT:    s_mul_i32 s4, s4, s11
+; GFX1250-NEXT:    s_add_co_ci_u32 s20, s29, s23
+; GFX1250-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX1250-NEXT:    s_mul_i32 s26, s0, s15
+; GFX1250-NEXT:    s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX1250-NEXT:    s_mul_i32 s5, s5, s10
+; GFX1250-NEXT:    s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX1250-NEXT:    s_mul_i32 s6, s6, s9
+; GFX1250-NEXT:    s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX1250-NEXT:    s_mul_i32 s7, s7, s8
+; GFX1250-NEXT:    s_add_co_ci_u32 s15, s25, s21
+; GFX1250-NEXT:    s_add_co_ci_u32 s21, s22, s26
+; GFX1250-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX1250-NEXT:    s_mul_i32 s0, s0, s8
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s21, s1
+; GFX1250-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s1, s2
+; GFX1250-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX1250-NEXT:    s_mov_b32 s2, s17
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GFX1250-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX1250-NEXT:    s_mov_b32 s3, s18
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s1, s4
+; GFX1250-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX1250-NEXT:    s_mov_b32 s4, s19
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s1, s5
+; GFX1250-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX1250-NEXT:    s_mov_b32 s5, s20
+; GFX1250-NEXT:    s_add_co_ci_u32 s1, s1, s6
+; GFX1250-NEXT:    s_mov_b32 s6, s15
+; GFX1250-NEXT:    s_add_co_i32 s7, s1, s7
+; GFX1250-NEXT:    s_mov_b32 s1, s16
+; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i256 %num, %den
   %cast = bitcast i256 %result to <8 x i32>
   ret <8 x i32> %cast
@@ -2478,6 +2853,107 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i256:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v0, v14, 0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], null, v0, v12, 0
+; GFX1250-NEXT:    v_mul_lo_u32 v26, v6, v9
+; GFX1250-NEXT:    v_mul_lo_u32 v29, v3, v12
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
+; GFX1250-NEXT:    v_mov_b32_e32 v20, v19
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v21, v22
+; GFX1250-NEXT:    v_mul_lo_u32 v22, v5, v10
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
+; GFX1250-NEXT:    v_mul_lo_u32 v25, v4, v11
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
+; GFX1250-NEXT:    v_mul_lo_u32 v20, v2, v13
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v28, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
+; GFX1250-NEXT:    v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v21, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
+; GFX1250-NEXT:    v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v23, v0, s2
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v20, s4
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v25, s1
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v26, s0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i256 %num, %den
   ret i256 %result
 }
@@ -2536,6 +3012,14 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = zext i32 %val to i64
   %mul = mul i64 %ext, 80
@@ -2632,6 +3116,21 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = zext i32 %val to i64
   %mul = mul i64 %ext, 80
@@ -2704,6 +3203,14 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
@@ -2815,6 +3322,20 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll
new file mode 100644
index 0000000000000..0373027201378
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_vv:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, %b
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_add_u64_vs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, %b
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_sv:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_sv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, %b
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_add_u64_ss:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, %b
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_inline_lit:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_inline_lit:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, 5
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_small_imm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_small_imm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, 500
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_64bit_imm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_64bit_imm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, 5294967295
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) {
+; GCN-LABEL: test_add_u64_s_small_imm:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x1f4
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %add = add i64 %a, 500
+  %ret = bitcast i64 %add to <2 x float>
+  ret <2 x float> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 95504052249e0..7fec5f71ce8d5 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -152,7 +152,7 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GCN-NEXT:    s_wait_xcnt 0x0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GCN-NEXT:    s_mov_b32 s0, exec_lo
 ; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v2
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index eff68ce2de11d..4a634520c682e 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -9,6 +9,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s
 
 ; GCN-ISEL-LABEL: name:   sadd64rr
 ; GCN-ISEL-LABEL: body:
@@ -113,6 +114,19 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: sadd64rr:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
   store i64 %add, ptr addrspace(1) %out
@@ -211,6 +225,17 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: sadd64ri:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], lit64(0x123456789876)
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add i64 20015998343286, %a
   store i64 %add, ptr addrspace(1) %out
@@ -301,6 +326,17 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vadd64rr:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -391,6 +427,17 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vadd64ri:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -486,6 +533,18 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: suaddo32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -606,6 +665,21 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: uaddo32_vcc_user:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_add_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -741,6 +815,22 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: suaddo64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -874,6 +964,23 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vuaddo64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -987,6 +1094,19 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ssub64rr:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %sub = sub i64 %a, %b
   store i64 %sub, ptr addrspace(1) %out
@@ -1085,6 +1205,17 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ssub64ri:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_sub_nc_u64 s[2:3], lit64(0x123456789876), s[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %sub = sub i64 20015998343286, %a
   store i64 %sub, ptr addrspace(1) %out
@@ -1175,6 +1306,17 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vsub64rr:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1265,6 +1407,17 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vsub64ri:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1361,6 +1514,18 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: susubo32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -1481,6 +1646,21 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: usubo32_vcc_user:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_sub_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -1616,6 +1796,22 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: susubo64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -1749,6 +1945,23 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: vusubo64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -2904,6 +3117,191 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:  .LBB16_4:
 ; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX11-NEXT:    s_branch .LBB16_2
+;
+; GFX1250-LABEL: sudiv64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x34
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_or_b64 s[0:1], s[10:11], s[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_and_b64 s[0:1], s[0:1], lit64(0xffffffff00000000)
+; GFX1250-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1250-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX1250-NEXT:  ; %bb.1:
+; GFX1250-NEXT:    s_cvt_f32_u32 s0, s2
+; GFX1250-NEXT:    s_cvt_f32_u32 s1, s3
+; GFX1250-NEXT:    s_sub_nc_u64 s[6:7], 0, s[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT:    s_fmac_f32 s0, s1, 0x4f800000
+; GFX1250-NEXT:    v_s_rcp_f32 s0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    s_mul_f32 s0, s0, 0x5f7ffffc
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_f32 s1, s0, 0x2f800000
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    s_trunc_f32 s1, s1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_fmac_f32 s0, s1, 0xcf800000
+; GFX1250-NEXT:    s_cvt_u32_f32 s5, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_cvt_u32_f32 s4, s0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_mul_u64 s[12:13], s[6:7], s[4:5]
+; GFX1250-NEXT:    s_mul_hi_u32 s15, s4, s13
+; GFX1250-NEXT:    s_mul_i32 s14, s4, s13
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s4, s12
+; GFX1250-NEXT:    s_mul_i32 s17, s5, s12
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[0:1], s[14:15]
+; GFX1250-NEXT:    s_mul_hi_u32 s16, s5, s12
+; GFX1250-NEXT:    s_mul_hi_u32 s18, s5, s13
+; GFX1250-NEXT:    s_add_co_u32 s0, s14, s17
+; GFX1250-NEXT:    s_add_co_ci_u32 s0, s15, s16
+; GFX1250-NEXT:    s_mul_i32 s12, s5, s13
+; GFX1250-NEXT:    s_add_co_ci_u32 s13, s18, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_add_co_u32 v0, s0, s4, s12
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT:    s_add_co_ci_u32 s5, s5, s13
+; GFX1250-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_u64 s[6:7], s[6:7], s[4:5]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_mul_hi_u32 s13, s4, s7
+; GFX1250-NEXT:    s_mul_i32 s12, s4, s7
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s4, s6
+; GFX1250-NEXT:    s_mul_i32 s15, s5, s6
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT:    s_mul_hi_u32 s14, s5, s6
+; GFX1250-NEXT:    s_mul_hi_u32 s4, s5, s7
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_co_u32 s0, s12, s15
+; GFX1250-NEXT:    s_add_co_ci_u32 s0, s13, s14
+; GFX1250-NEXT:    s_mul_i32 s6, s5, s7
+; GFX1250-NEXT:    s_add_co_ci_u32 s7, s4, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[0:1], s[6:7]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_add_co_u32 v0, s0, v0, s6
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT:    s_add_co_ci_u32 s0, s5, s7
+; GFX1250-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_hi_u32 s5, s10, s0
+; GFX1250-NEXT:    s_mul_i32 s4, s10, s0
+; GFX1250-NEXT:    s_mul_hi_u32 s12, s11, s0
+; GFX1250-NEXT:    s_mul_i32 s6, s11, s0
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s10, s7
+; GFX1250-NEXT:    s_mul_i32 s13, s11, s7
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[0:1], s[4:5]
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s11, s7
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_co_u32 s4, s4, s13
+; GFX1250-NEXT:    s_add_co_ci_u32 s0, s5, s0
+; GFX1250-NEXT:    s_add_co_ci_u32 s7, s12, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[0:1], s[6:7]
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_or_b32 s6, s6, s4
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[2:3], s[6:7]
+; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[6:7], 2
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    v_sub_co_u32 v0, s0, s10, s4
+; GFX1250-NEXT:    s_sub_co_i32 s4, s11, s5
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1250-NEXT:    v_sub_co_u32 v1, s12, v0, s2
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_sub_co_ci_u32 s4, s4, s3
+; GFX1250-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[6:7], 1
+; GFX1250-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_sub_co_ci_u32 s4, s4, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX1250-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX1250-NEXT:    s_cselect_b32 s14, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s4, s3
+; GFX1250-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s14, v1, vcc_lo
+; GFX1250-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v0
+; GFX1250-NEXT:    s_sub_co_ci_u32 s0, s11, s5
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s0, s3
+; GFX1250-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s4, v0, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_cndmask_b32_e32 v2, s12, v2, vcc_lo
+; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s13, v3, vcc_lo
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s7, v1, vcc_lo
+; GFX1250-NEXT:    v_cndmask_b32_e32 v0, s6, v2, vcc_lo
+; GFX1250-NEXT:    s_cbranch_execnz .LBB16_3
+; GFX1250-NEXT:  .LBB16_2:
+; GFX1250-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX1250-NEXT:    s_sub_co_i32 s1, 0, s2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX1250-NEXT:    v_nop
+; GFX1250-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_i32 s1, s1, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_hi_u32 s0, s10, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_mul_i32 s1, s0, s2
+; GFX1250-NEXT:    s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_sub_co_i32 s1, s10, s1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_sub_co_i32 s4, s1, s2
+; GFX1250-NEXT:    s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX1250-NEXT:    s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT:  .LBB16_3:
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
+; GFX1250-NEXT:    s_endpgm
+; GFX1250-NEXT:  .LBB16_4:
+; GFX1250-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT:    s_branch .LBB16_2
   %result = udiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index dea9142cf2bee..f9fae025e0bf8 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -737,7 +737,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 0x7b ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0xfd,0x03,0x7b,0x00,0x00,0x00]
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 0x7b, v[0:1] ; encoding: [0xff,0x00,0x00,0x50,0x7b,0x00,0x00,0x00]
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %add = add i64 %x, 123
   ret i64 %add
@@ -747,7 +747,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
 ; GFX10: codeLenInByte = 28
 ; GFX1100: codeLenInByte = 32
 ; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 24
+; GFX1250: codeLenInByte = 20
 
 define i64 @v_add_u64_vop2_literal_64(i64 %x) {
 ; GFX9-LABEL: v_add_u64_vop2_literal_64:
@@ -788,9 +788,7 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT:    s_mov_b64 s[0:1], lit64(0x112345678) ; encoding: [0xfe,0x01,0x80,0xbe,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
-; GFX1250-NEXT:    s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
-; GFX1250-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0x01,0x00]
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], lit64(0x112345678), v[0:1] ; encoding: [0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %add = add i64 %x, 4600387192
   ret i64 %add
@@ -800,6 +798,6 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
 ; GFX10: codeLenInByte = 28
 ; GFX1100: codeLenInByte = 32
 ; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 36
+; GFX1250: codeLenInByte = 24
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; NOT-GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index f4040f3049e0d..eba46a1ecb614 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -256,7 +256,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -350,8 +350,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -455,7 +455,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB12_3
@@ -529,8 +529,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -676,7 +676,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -704,7 +704,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -751,7 +751,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
@@ -772,8 +772,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -805,7 +805,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -856,7 +856,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
@@ -879,7 +879,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB20_3
@@ -904,7 +904,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
@@ -943,7 +943,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -959,8 +959,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -989,7 +989,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
@@ -1032,7 +1032,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1112,7 +1112,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -1131,7 +1131,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB26_2
@@ -1140,9 +1140,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT:    v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -1179,7 +1177,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB26_2
@@ -1189,9 +1187,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT:    v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
@@ -1212,8 +1208,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1236,7 +1232,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB27_2
@@ -1245,9 +1241,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT:    v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -1288,7 +1282,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB27_2
@@ -1298,9 +1292,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT:    v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
@@ -1323,7 +1315,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB28_3
@@ -1338,7 +1330,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
@@ -1348,9 +1340,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
@@ -1378,7 +1368,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
@@ -1389,9 +1379,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1407,8 +1395,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -1427,7 +1415,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
@@ -1437,9 +1425,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
@@ -1471,7 +1457,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
@@ -1482,9 +1468,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1564,7 +1548,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -1662,8 +1646,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1771,7 +1755,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB36_3
@@ -1853,8 +1837,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -2008,7 +1992,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2106,8 +2090,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2215,7 +2199,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB44_3
@@ -2297,8 +2281,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -2452,7 +2436,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2550,8 +2534,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2659,7 +2643,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB52_3
@@ -2741,8 +2725,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -2890,7 +2874,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2992,8 +2976,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -3105,7 +3089,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB60_3
@@ -3187,8 +3171,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -3336,7 +3320,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -3438,8 +3422,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -3551,7 +3535,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB68_3
@@ -3633,8 +3617,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -3782,7 +3766,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -3884,8 +3868,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -3997,7 +3981,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB76_3
@@ -4079,8 +4063,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -4228,7 +4212,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -4330,8 +4314,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -4443,7 +4427,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB84_3
@@ -4525,8 +4509,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -4695,7 +4679,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v3
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -4802,8 +4786,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -4920,7 +4904,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB92_3
@@ -5010,8 +4994,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -5164,7 +5148,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -5192,10 +5176,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], 1, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, 1
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
@@ -5243,10 +5227,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
@@ -5269,8 +5253,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -5302,10 +5286,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], 1, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, 1
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
@@ -5357,10 +5341,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[2:3], off
@@ -5385,7 +5369,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB100_3
@@ -5408,10 +5392,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], 1, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, 1
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
@@ -5449,10 +5433,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[0:1], off
@@ -5470,8 +5454,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -5498,10 +5482,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], 1, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, 1
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-SDAG-NEXT:    v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
 ; GFX1250-SDAG-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
@@ -5543,10 +5527,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v6, off
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
 ; GFX1250-GISEL-NEXT:    scratch_store_b64 v6, v[0:1], off
@@ -5621,7 +5605,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v5
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -5651,7 +5635,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], -1, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5703,7 +5687,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], -1, v[0:1]
 ; GFX1250-GISEL-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5727,8 +5711,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -5762,7 +5746,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[4:5], -1, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5818,7 +5802,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], -1, v[0:1]
 ; GFX1250-GISEL-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5844,7 +5828,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmpx_ne_u32_e64 s1, v1
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB108_3
@@ -5869,7 +5853,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], -1, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -5913,7 +5897,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[0:1], -1, v[0:1]
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
@@ -5934,8 +5918,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -5964,7 +5948,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], -1, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-SDAG-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_alu 0xfffe
@@ -6012,7 +5996,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX1250-GISEL-NEXT:    v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[0:1], -1, v[0:1]
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-GISEL-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index e6018e413a85d..3f1e354f2ccc7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -341,7 +341,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -673,7 +673,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
@@ -703,7 +703,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
@@ -2140,7 +2140,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 4, v[2:3]
 ; GFX1250-GISEL-NEXT:    flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
@@ -2198,7 +2198,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX1250-GISEL-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 4, v[2:3]
 ; GFX1250-GISEL-NEXT:    flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-GISEL-NEXT:    flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 79907fd0c60bc..fd644a35f61e3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -304,78 +304,79 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:4
 ; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:112
-; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:96
-; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:112
+; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:96
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:80
 ; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:32
-; GCN-SDAG-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:16
-; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:16
+; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off
 ; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[16:17], 0x70
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[24:25], 0x70
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[50:51], 0x60
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[38:39], 0x50
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 64
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[38:39], 0x50
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 64
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[16:17], v[6:9], off
+; GCN-SDAG-NEXT:    global_store_b128 v[24:25], v[10:13], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[10:13], off
+; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[18:21], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x5
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x4
 ; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[34:37], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[30:33], off
+; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[14:17], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[22:25], off
+; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[26:29], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[26:29], off
+; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[30:33], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x3
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x2
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21]
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
 ; GCN-SDAG-NEXT:    s_clause 0x1
-; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[14:17], off
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[22:25], off
 ; GCN-SDAG-NEXT:    global_store_b128 v[48:49], v[0:3], off
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:96
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:96
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:112
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:80
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:32
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:80
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:32
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:48
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:16
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:16
 ; GCN-SDAG-NEXT:    s_clause 0x3
 ; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32
 ; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:4
 ; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:8
 ; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:12
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0xc
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
@@ -403,11 +404,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[48:49], 16
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[50:51], 32
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[52:53], 48
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x60
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[54:55], 64
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[44:45], 0x70
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[34:35], 0xc8
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[40:41], 0x50
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x60
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[44:45], 0x70
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x6
 ; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[10:13], off
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x5
@@ -422,28 +423,28 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    global_store_b128 v[44:45], v[30:33], off
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[36:37], v[8:9]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x5
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x4
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[16:17], v[16:17], 0, v[16:17]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x3
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[20:21], 0x64, v[20:21]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x2
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25]
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x1
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x0
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
 ; GCN-GISEL-NEXT:    s_clause 0x1
 ; GCN-GISEL-NEXT:    global_store_b128 v[54:55], v[0:3], off
 ; GCN-GISEL-NEXT:    global_store_b128 v[40:41], v[34:37], off
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index df4ff2c8d9851..6706e7638580d 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -12,21 +12,11 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) {
 }
 
 define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_u64:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xf12345678)
-; GCN-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT:    s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_u64:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[4:5], lit64(0xf12345678)
-; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT:    s_endpgm
+; GCN-LABEL: v_add_u64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
+; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT:    s_endpgm
   %result = add i64 %a, 64729929336
   store i64 %result, ptr addrspace(1) %out, align 8
   ret void
@@ -42,21 +32,11 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) {
 }
 
 define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_neg_u64:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_mov_b64 s[0:1], lit64(0xfffffff0edcba988)
-; GCN-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT:    s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_neg_u64:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[4:5], lit64(0xfffffff0edcba988)
-; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT:    s_endpgm
+; GCN-LABEL: v_add_neg_u64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], lit64(0xfffffff0edcba988), v[0:1]
+; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT:    s_endpgm
   %result = sub i64 %a, 64729929336
   store i64 %result, ptr addrspace(1) %out, align 8
   ret void
@@ -74,9 +54,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) {
 define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sub_u64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sub_co_u32 v0, vcc_lo, 0x12345678, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo
+; GCN-NEXT:    v_sub_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
 ; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GCN-NEXT:    s_endpgm
   %result = sub i64 64729929336, %a
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 91b3a85d36114..8d3716ef62f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -5,6 +5,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
 
 ; mul24 and mad24 are affected
@@ -124,6 +125,25 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: test_mul_v2i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: test_mul_v2i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -286,6 +306,29 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul_v4i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], null offset:16
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v3, v3, v7
+; GFX1250-NEXT:    v_mul_lo_u32 v2, v2, v6
+; GFX1250-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul_v4i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -402,6 +445,19 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: s_trunc_i64_mul_to_i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x34
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_trunc_i64_mul_to_i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -555,6 +611,29 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_trunc_i64_mul_to_i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT:    s_mov_b32 s10, -1
+; GFX1250-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s14, s10
+; GFX1250-NEXT:    s_mov_b32 s15, s11
+; GFX1250-NEXT:    s_mov_b32 s6, s10
+; GFX1250-NEXT:    s_mov_b32 s7, s11
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s12, s2
+; GFX1250-NEXT:    s_mov_b32 s13, s3
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[12:15], null
+; GFX1250-NEXT:    buffer_load_b32 v1, off, s[4:7], null
+; GFX1250-NEXT:    s_mov_b32 s8, s0
+; GFX1250-NEXT:    s_mov_b32 s9, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_trunc_i64_mul_to_i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -670,6 +749,19 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: mul64_sext_c:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: mul64_sext_c:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -773,6 +865,18 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: mul64_zext_c:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: mul64_zext_c:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -909,6 +1013,26 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul64_sext_c:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul64_sext_c:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1052,6 +1176,25 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul64_zext_c:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul64_zext_c:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1192,6 +1335,26 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul64_sext_inline_imm:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mul_u64_e32 v[0:1], 9, v[0:1]
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul64_sext_inline_imm:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1300,6 +1463,20 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_mul_i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1425,6 +1602,24 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul_i32:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1540,6 +1735,22 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: s_mul_i1:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_and_b32 s2, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    s_and_b32 s2, s2, 1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_mul_i1:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
@@ -1699,6 +1910,28 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[4:7], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul_i1:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s10, s6
+; GFX1250-NEXT:    s_mov_b32 s11, s7
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s8, s2
+; GFX1250-NEXT:    s_mov_b32 s9, s3
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_load_u8 v0, off, s[8:11], null
+; GFX1250-NEXT:    buffer_load_u8 v1, off, s[8:11], null offset:4
+; GFX1250-NEXT:    s_mov_b32 s4, s0
+; GFX1250-NEXT:    s_mov_b32 s5, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT:    buffer_store_b8 v0, off, s[4:7], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul_i1:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -1856,6 +2089,19 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_mul_i64:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
@@ -2044,6 +2290,29 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT:    s_mov_b32 s10, -1
+; GFX1250-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s14, s10
+; GFX1250-NEXT:    s_mov_b32 s15, s11
+; GFX1250-NEXT:    s_mov_b32 s6, s10
+; GFX1250-NEXT:    s_mov_b32 s7, s11
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s12, s2
+; GFX1250-NEXT:    s_mov_b32 s13, s3
+; GFX1250-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null
+; GFX1250-NEXT:    buffer_load_b64 v[2:3], off, s[4:7], null
+; GFX1250-NEXT:    s_mov_b32 s8, s0
+; GFX1250-NEXT:    s_mov_b32 s9, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul_i64:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -2286,6 +2555,41 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: mul32_in_branch:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT:    s_mov_b32 s6, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT:    s_cbranch_scc0 .LBB15_2
+; GFX1250-NEXT:  ; %bb.1: ; %else
+; GFX1250-NEXT:    s_mul_i32 s7, s0, s1
+; GFX1250-NEXT:    s_branch .LBB15_3
+; GFX1250-NEXT:  .LBB15_2:
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    ; implicit-def: $sgpr7
+; GFX1250-NEXT:  .LBB15_3: ; %Flow
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX1250-NEXT:    s_cbranch_vccnz .LBB15_5
+; GFX1250-NEXT:  ; %bb.4: ; %if
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s4, s2
+; GFX1250-NEXT:    s_mov_b32 s5, s3
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX1250-NEXT:    s_branch .LBB15_6
+; GFX1250-NEXT:  .LBB15_5:
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1250-NEXT:  .LBB15_6: ; %endif
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: mul32_in_branch:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
@@ -2539,6 +2843,34 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: mul64_in_branch:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1250-NEXT:    s_cbranch_scc0 .LBB16_3
+; GFX1250-NEXT:  ; %bb.1: ; %else
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX1250-NEXT:    s_cbranch_execnz .LBB16_4
+; GFX1250-NEXT:  .LBB16_2: ; %if
+; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_mov_b32 s4, s2
+; GFX1250-NEXT:    s_mov_b32 s5, s3
+; GFX1250-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT:    s_branch .LBB16_5
+; GFX1250-NEXT:  .LBB16_3:
+; GFX1250-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX1250-NEXT:    s_branch .LBB16_2
+; GFX1250-NEXT:  .LBB16_4:
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT:  .LBB16_5: ; %endif
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: mul64_in_branch:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
@@ -2882,6 +3214,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
+; GFX1250-NEXT:    s_load_b128 s[12:15], s[4:5], 0x4c
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_mov_b64 s[4:5], lit64(0xffffffff)
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_mov_b32 s7, s3
+; GFX1250-NEXT:    s_mov_b32 s17, s3
+; GFX1250-NEXT:    s_mov_b32 s19, s3
+; GFX1250-NEXT:    s_mov_b32 s20, s3
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s2, s8
+; GFX1250-NEXT:    s_and_b64 s[4:5], s[12:13], s[4:5]
+; GFX1250-NEXT:    s_mov_b32 s6, s13
+; GFX1250-NEXT:    s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT:    s_mul_u64 s[12:13], s[4:5], s[2:3]
+; GFX1250-NEXT:    s_mov_b32 s16, s9
+; GFX1250-NEXT:    s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX1250-NEXT:    s_mul_u64 s[14:15], s[6:7], s[2:3]
+; GFX1250-NEXT:    s_mov_b32 s2, s13
+; GFX1250-NEXT:    s_mul_u64 s[4:5], s[4:5], s[16:17]
+; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[14:15], s[2:3]
+; GFX1250-NEXT:    s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX1250-NEXT:    s_mov_b32 s2, s15
+; GFX1250-NEXT:    s_mov_b32 s15, s3
+; GFX1250-NEXT:    s_mov_b32 s13, s3
+; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[14:15]
+; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[10:11], s[8:9]
+; GFX1250-NEXT:    s_mov_b32 s18, s5
+; GFX1250-NEXT:    s_mov_b32 s21, s4
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX1250-NEXT:    s_or_b64 s[4:5], s[12:13], s[20:21]
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[6:7], s[2:3]
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[8:9]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT:    s_mov_b32 s2, -1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: s_mul_i128:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
@@ -3159,6 +3537,43 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
 ; GFX12-NEXT:    s_endpgm
 ;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX1250-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b128 v[0:3], v16, s[2:3] scale_offset
+; GFX1250-NEXT:    global_load_b128 v[4:7], v16, s[0:1] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT:    v_mul_u64_e32 v[6:7], v[0:1], v[6:7]
+; GFX1250-NEXT:    v_mul_lo_u32 v3, v3, v4
+; GFX1250-NEXT:    v_mul_u64_e32 v[8:9], v[8:9], v[10:11]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7]
+; GFX1250-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GFX1250-NEXT:    v_mov_b32_e32 v10, v9
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11]
+; GFX1250-NEXT:    v_add3_u32 v7, v3, v7, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12
+; GFX1250-NEXT:    v_mov_b32_e32 v14, v13
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7]
+; GFX1250-NEXT:    global_store_b128 v16, v[8:11], s[2:3] scale_offset
+; GFX1250-NEXT:    s_endpgm
+;
 ; EG-LABEL: v_mul_i128:
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -3271,6 +3686,13 @@ define i32 @mul_pow2_plus_1(i32 %val) {
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX1250-LABEL: mul_pow2_plus_1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+;
 ; EG-LABEL: mul_pow2_plus_1:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    CF_END
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 64392a15e9a9b..192dce369b0ef 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -369,7 +369,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; SDAG-NEXT:    s_wait_loadcnt 0x0
-; SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; SDAG-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; SDAG-NEXT:    s_wait_xcnt 0x0
 ; SDAG-NEXT:    s_wait_alu 0xfffe
@@ -418,7 +418,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GISEL-NEXT:    scratch_load_b64 v[0:1], v4, off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; GISEL-NEXT:    v_add_nc_u64_e32 v[2:3], 1, v[0:1]
 ; GISEL-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GISEL-NEXT:    s_wait_xcnt 0x0
 ; GISEL-NEXT:    s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
new file mode 100644
index 0000000000000..baaca4ddeaf05
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_vv:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_sub_u64_vs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_subrev_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_sv:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_sv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_sub_u64_ss:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_inline_lit_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, 5, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_inline_lit_v:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 5, %a
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_sub_u64_v_inline_lit:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_v_inline_lit:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], -5, v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, 5
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_small_imm_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_small_imm_v:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 500, %a
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_64bit_imm_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_64bit_imm_v:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT:    ; return to shader part epilog
+  %sub = sub i64 5294967295, %a
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) {
+; GCN-LABEL: test_sub_u64_small_imm_s:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+  %sub = sub i64 500, %a
+  %ret = bitcast i64 %sub to <2 x float>
+  ret <2 x float> %ret
+}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
index 20bc578605b8c..0a1d3bfc02503 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
@@ -154,6 +154,362 @@ v_fmac_f64 v[4:5], v[2:3], v[8:9] div:2
 // GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18]
 // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
+v_add_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5]    ; encoding: [0x6a,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5]   ; encoding: [0x7e,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5]      ; encoding: [0x80,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5]     ; encoding: [0xc1,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5]    ; encoding: [0xf0,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5]   ; encoding: [0xf7,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9]    ; encoding: [0x6a,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9]   ; encoding: [0x7e,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9]      ; encoding: [0x80,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9]     ; encoding: [0xc1,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9]    ; encoding: [0xf0,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9]   ; encoding: [0xf7,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc    ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec   ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0      ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1     ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5    ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0   ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5]    ; encoding: [0x6a,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5]   ; encoding: [0x7e,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5]      ; encoding: [0x80,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5]     ; encoding: [0xc1,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5]    ; encoding: [0xf0,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5]   ; encoding: [0xf7,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9]    ; encoding: [0x6a,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9]   ; encoding: [0x7e,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9]      ; encoding: [0x80,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9]     ; encoding: [0xc1,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9]    ; encoding: [0xf0,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9]   ; encoding: [0xf7,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc    ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec   ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0      ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1     ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5    ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0   ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5]    ; encoding: [0x02,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5]    ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5]       ; encoding: [0x6a,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5]      ; encoding: [0x7e,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5]         ; encoding: [0x80,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5]        ; encoding: [0xc1,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5]       ; encoding: [0xf0,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5]      ; encoding: [0xf7,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9]    ; encoding: [0x02,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9]       ; encoding: [0x6a,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9]      ; encoding: [0x7e,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9]         ; encoding: [0x80,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9]        ; encoding: [0xc1,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9]       ; encoding: [0xf0,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9]      ; encoding: [0xf7,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc       ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], exec
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec      ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0         ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -1
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1        ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5       ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0      ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3]
 // GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
 // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
index f67ad88b5ae83..9f5036106dbd3 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
@@ -22,3 +22,8 @@ v_fmamk_f16 v4, v2, 3, v6 row_share:1
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 // GFX1250-ERR-NEXT:{{^}}v_fmamk_f16 v4, v2, 3, v6 row_share:1
 // GFX1250-ERR-NEXT:{{^}}                          ^
+
+v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR-NEXT:{{^}}                                 ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
index c1213f2d9ec0d..130941c8c1397 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
@@ -112,6 +112,264 @@
 0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00
 # GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00]
 
+0x02,0x09,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+
+0x02,0x11,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+
+0xc1,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5]     ; encoding: [0xc1,0x08,0x08,0x50]
+
+0xc1,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9]     ; encoding: [0xc1,0x10,0x08,0x50]
+
+0xf7,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5]   ; encoding: [0xf7,0x08,0x08,0x50]
+
+0xf7,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9]   ; encoding: [0xf7,0x10,0x08,0x50]
+
+0x80,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5]      ; encoding: [0x80,0x08,0x08,0x50]
+
+0x80,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9]      ; encoding: [0x80,0x10,0x08,0x50]
+
+0xf0,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5]    ; encoding: [0xf0,0x08,0x08,0x50]
+
+0xf0,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9]    ; encoding: [0xf0,0x10,0x08,0x50]
+
+0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x50,0x56,0x34,0x12,0xaf
+# GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5]   ; encoding: [0x7e,0x08,0x08,0x50]
+
+0x7e,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9]   ; encoding: [0x7e,0x10,0x08,0x50]
+
+0xfe,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+
+0xfe,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+
+0x02,0xfd,0x09,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+
+0x02,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+
+0x02,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+
+0x6a,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5]    ; encoding: [0x6a,0x08,0x08,0x50]
+
+0x6a,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9]    ; encoding: [0x6a,0x10,0x08,0x50]
+
+0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1     ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0   ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0      ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5    ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec   ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc    ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+
+0x02,0x11,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+
+0xc1,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5]     ; encoding: [0xc1,0x08,0x08,0x52]
+
+0xc1,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9]     ; encoding: [0xc1,0x10,0x08,0x52]
+
+0xf7,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5]   ; encoding: [0xf7,0x08,0x08,0x52]
+
+0xf7,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9]   ; encoding: [0xf7,0x10,0x08,0x52]
+
+0x80,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5]      ; encoding: [0x80,0x08,0x08,0x52]
+
+0x80,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9]      ; encoding: [0x80,0x10,0x08,0x52]
+
+0xf0,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5]    ; encoding: [0xf0,0x08,0x08,0x52]
+
+0xf0,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9]    ; encoding: [0xf0,0x10,0x08,0x52]
+
+0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x52,0x56,0x34,0x12,0xaf
+# GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5]   ; encoding: [0x7e,0x08,0x08,0x52]
+
+0x7e,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9]   ; encoding: [0x7e,0x10,0x08,0x52]
+
+0xfe,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+
+0xfe,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+
+0x02,0xfd,0x09,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+
+0x02,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+
+0x02,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+
+0x6a,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5]    ; encoding: [0x6a,0x08,0x08,0x52]
+
+0x6a,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9]    ; encoding: [0x6a,0x10,0x08,0x52]
+
+0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1     ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0   ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0      ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5    ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec   ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc    ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+
+0x02,0x11,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+
+0xc1,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5]        ; encoding: [0xc1,0x08,0x08,0x54]
+
+0xc1,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9]        ; encoding: [0xc1,0x10,0x08,0x54]
+
+0xf7,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5]      ; encoding: [0xf7,0x08,0x08,0x54]
+
+0xf7,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9]      ; encoding: [0xf7,0x10,0x08,0x54]
+
+0x80,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5]         ; encoding: [0x80,0x08,0x08,0x54]
+
+0x80,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9]         ; encoding: [0x80,0x10,0x08,0x54]
+
+0xf0,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5]       ; encoding: [0xf0,0x08,0x08,0x54]
+
+0xf0,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9]       ; encoding: [0xf0,0x10,0x08,0x54]
+
+0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f
+# GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x54,0x56,0x34,0x12,0xaf
+# GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5]      ; encoding: [0x7e,0x08,0x08,0x54]
+
+0x7e,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9]      ; encoding: [0x7e,0x10,0x08,0x54]
+
+0xfe,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+
+0xfe,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+
+0x02,0xfd,0x09,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+
+0x02,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5]    ; encoding: [0x02,0x09,0x08,0x54]
+
+0x02,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9]    ; encoding: [0x02,0x11,0x08,0x54]
+
+0x6a,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5]       ; encoding: [0x6a,0x08,0x08,0x54]
+
+0x6a,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9]       ; encoding: [0x6a,0x10,0x08,0x54]
+
+0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5]    ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1        ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0      ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0         ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5       ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec      ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc       ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+
 0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40
 # GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
 



More information about the llvm-commits mailing list