[llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 27 06:32:44 PDT 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96739

>From 906c8fe7193f148ecb08d0b89405391c3006db72 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 11 Jun 2024 11:46:15 +0200
Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics

These have been replaced with atomicrmw.
---
 llvm/docs/ReleaseNotes.rst                    |    5 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   14 -
 llvm/lib/IR/AutoUpgrade.cpp                   |    8 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   32 -
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |    3 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |    2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |   20 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   15 +-
 llvm/test/Bitcode/amdgcn-atomic.ll            |   52 +
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll  |  371 -----
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll  |  279 ----
 .../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll    | 1418 -----------------
 12 files changed, 65 insertions(+), 2154 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 416b3952f1ac4..ed7d252668850 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,6 +132,11 @@ Changes to the AMDGPU Backend
 
 * Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
 
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw <i_atomicrmw>` instruction with ``fadd``, ``fmin`` and
+  ``fmax`` with addrspace(3) instead.
+
 Changes to the ARM Backend
 --------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d040aa8f38278..71b1e832bde3c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
             [IntrNoMem, IntrSpeculatable]
 >;
 
-class AMDGPULDSIntrin :
-  Intrinsic<[llvm_any_ty],
-    [LLVMQualPointerType<3>,
-    LLVMMatchType<0>,
-    llvm_i32_ty, // ordering
-    llvm_i32_ty, // scope
-    llvm_i1_ty], // isVolatile
-    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
-     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
->;
-
 // FIXME: The m0 argument should be moved after the normal arguments
 class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [llvm_i32_ty],
@@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
-def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
-
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d7825d9b3e3e5..32076a07d30e7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
         break; // No other 'amdgcn.atomic.*'
       }
 
-      if (Name.starts_with("ds.fadd")) {
-        // Replaced with atomicrmw fadd, so there's no new declaration.
+      if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
+          Name.starts_with("ds.fmax")) {
+        // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+        // declaration.
         NewFn = nullptr;
         return true;
       }
@@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
   AtomicRMWInst::BinOp RMWOp =
       StringSwitch<AtomicRMWInst::BinOp>(Name)
           .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+          .StartsWith("ds.fmin", AtomicRMWInst::FMin)
+          .StartsWith("ds.fmax", AtomicRMWInst::FMax)
           .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
           .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f1254b2e9e1d2..dc165d65fa6ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5401,35 +5401,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
-  switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-    return AMDGPU::G_ATOMICRMW_FMIN;
-  case Intrinsic::amdgcn_ds_fmax:
-    return AMDGPU::G_ATOMICRMW_FMAX;
-  default:
-    llvm_unreachable("not a DS FP intrinsic");
-  }
-}
-
-bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-                                                      MachineInstr &MI,
-                                                      Intrinsic::ID IID) const {
-  GISelChangeObserver &Observer = Helper.Observer;
-  Observer.changingInstr(MI);
-
-  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
-
-  // The remaining operands were used to set fields in the MemOperand on
-  // construction.
-  for (int I = 6; I > 3; --I)
-    MI.removeOperand(I);
-
-  MI.removeOperand(1); // Remove the intrinsic ID.
-  Observer.changedInstr(MI);
-  return true;
-}
-
 // TODO: Fix pointer type handling
 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                          MachineInstr &MI,
@@ -7451,9 +7422,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeBufferAtomic(MI, B, IntrID);
   case Intrinsic::amdgcn_rsq_clamp:
     return legalizeRsqClampIntrinsic(MI, MRI, B);
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax:
-    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
   case Intrinsic::amdgcn_image_bvh_intersect_ray:
     return legalizeBVHIntrinsic(MI, B);
   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index ae01bb29c1108..db1c5874093a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -175,9 +175,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B) const;
 
-  bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-                                   MachineInstr &MI, Intrinsic::ID IID) const;
-
   bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index ed5bae3e4ff61..a323f63767737 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -252,8 +252,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_ds_fmin>;
-def : SourceOfDivergence<int_amdgcn_ds_fmax>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 1192b49fd1f08..8882839ed8de3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -501,9 +501,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) const {
   switch (Inst->getIntrinsicID()) {
   case Intrinsic::amdgcn_ds_ordered_add:
-  case Intrinsic::amdgcn_ds_ordered_swap:
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax: {
+  case Intrinsic::amdgcn_ds_ordered_swap: {
     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
     if (!Ordering || !Volatile)
@@ -1018,8 +1016,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                             Intrinsic::ID IID) const {
   switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax:
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
   case Intrinsic::amdgcn_flat_atomic_fadd:
@@ -1039,20 +1035,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                     Value *NewV) const {
   auto IntrID = II->getIntrinsicID();
   switch (IntrID) {
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax: {
-    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
-    if (!IsVolatile->isZero())
-      return nullptr;
-    Module *M = II->getParent()->getParent()->getParent();
-    Type *DestTy = II->getType();
-    Type *SrcTy = NewV->getType();
-    Function *NewDecl =
-        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
-    II->setArgOperand(0, NewV);
-    II->setCalledFunction(NewDecl);
-    return II;
-  }
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private: {
     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 83bfb622ee525..16fa7266a4b7d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1279,9 +1279,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
 
   switch (IntrID) {
   case Intrinsic::amdgcn_ds_ordered_add:
-  case Intrinsic::amdgcn_ds_ordered_swap:
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax: {
+  case Intrinsic::amdgcn_ds_ordered_swap: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
     Info.ptrVal = CI.getOperand(0);
@@ -1450,8 +1448,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   case Intrinsic::amdgcn_atomic_cond_sub_u32:
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume:
-  case Intrinsic::amdgcn_ds_fmax:
-  case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_ordered_add:
   case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_flat_atomic_fadd:
@@ -8899,15 +8895,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                    M->getVTList(), Ops, M->getMemoryVT(),
                                    M->getMemOperand());
   }
-  case Intrinsic::amdgcn_ds_fmin:
-  case Intrinsic::amdgcn_ds_fmax: {
-    MemSDNode *M = cast<MemSDNode>(Op);
-    unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
-                                                       : ISD::ATOMIC_LOAD_FMAX;
-    return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
-                         M->getOperand(2), M->getOperand(3),
-                         M->getMemOperand());
-  }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 311bd8863859b..ed7b04a2f3146 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -248,4 +248,56 @@ define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__missing_args_as_i16(ptr addrspa
   ret <2 x i16> %result0
 }
 
+declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
+declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32 immarg, i32 immarg, i1 immarg)
+
+define float @upgrade_amdgcn_ds_fmin_f32(ptr addrspace(3) %ptr, float %val) {
+  ; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+  %result0 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
+
+  ; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+  %result1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
+
+  ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+  %result2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 43, i32 3, i1 false)
+
+  ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") acquire, align 4
+  %result3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 4, i32 2, i1 false)
+
+  ret float %result3
+}
+
+define double @upgrade_amdgcn_ds_fmin_f64(ptr addrspace(3) %ptr, double %val) {
+  ; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+  %result0 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 false)
+
+  ; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+  %result1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 true)
+
+  ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
+  %result2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 43, i32 3, i1 false)
+
+  ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") acquire, align 8
+  %result3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 4, i32 2, i1 false)
+
+  ret double %result3
+}
+
+declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
+
+define float @upgrade_amdgcn_ds_fmin_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
+  ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+
+  %result0 = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
+  ret float %result0
+}
+
+declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
+
+define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
+  ; CHECK: = atomicrmw fmax ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
+  %result0 = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
+  ret float %result0
+}
+
 attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
deleted file mode 100644
index e4c4f42b137ef..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-
-; Make sure the memory operand information is preserved.
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-MIR %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9-MIR %s
-
-
-define amdgpu_ps float @ds_fmax_f32_ss(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmax_f32_ss:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fmax_f32_ss:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX8-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX9-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR-NEXT:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define amdgpu_ps float @ds_fmax_f32_ss_offset(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmax_f32_ss_offset:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_rtn_f32 v0, v1, v0 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fmax_f32_ss_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v1, v0 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX8-MIR-NEXT:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX8-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX9-MIR-NEXT:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX9-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define amdgpu_ps void @ds_fmax_f32_ss_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmax_f32_ss_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_f32 v0, v1
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: ds_fmax_f32_ss_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    ds_max_f32 v0, v1
-; GFX9-NEXT:    s_endpgm
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_nortn
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX8-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   DS_MAX_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR-NEXT:   S_ENDPGM 0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_nortn
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX9-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR-NEXT:   DS_MAX_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR-NEXT:   S_ENDPGM 0
-  %unused = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define amdgpu_ps void @ds_fmax_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmax_f32_ss_offset_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_f32 v1, v0 offset:512
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: ds_fmax_f32_ss_offset_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    ds_max_f32 v1, v0 offset:512
-; GFX9-NEXT:    s_endpgm
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX8-MIR-NEXT:   DS_MAX_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX8-MIR-NEXT:   S_ENDPGM 0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $sgpr2, $sgpr3
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GFX9-MIR-NEXT:   DS_MAX_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX9-MIR-NEXT:   S_ENDPGM 0
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %unused = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define float @ds_fmax_f32_vv(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmax_f32_vv:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmax_f32_vv:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-MIR-NEXT:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define float @ds_fmax_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmax_f32_vv_offset:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_rtn_f32 v0, v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmax_f32_vv_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX8-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-MIR-NEXT:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX9-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define void @ds_fmax_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmax_f32_vv_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_f32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmax_f32_vv_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_max_f32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_nortn
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   DS_MAX_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR-NEXT:   SI_RETURN
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_nortn
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-MIR-NEXT:   DS_MAX_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR-NEXT:   SI_RETURN
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define void @ds_fmax_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmax_f32_vv_offset_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_f32 v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmax_f32_vv_offset_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_max_f32 v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   DS_MAX_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX8-MIR-NEXT:   SI_RETURN
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-MIR-NEXT:   DS_MAX_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX9-MIR-NEXT:   SI_RETURN
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define float @ds_fmax_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmax_f32_vv_volatile:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmax_f32_vv_volatile:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_volatile
-  ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX8-MIR-NEXT: {{  $}}
-  ; GFX8-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX8-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX8-MIR-NEXT:   $m0 = S_MOV_B32 -1
-  ; GFX8-MIR-NEXT:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_volatile
-  ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR-NEXT:   liveins: $vgpr0, $vgpr1
-  ; GFX9-MIR-NEXT: {{  $}}
-  ; GFX9-MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-MIR-NEXT:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR-NEXT:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR-NEXT:   SI_RETURN implicit $vgpr0
-  %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
-  ret float %ret
-}
-
-declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) #0
-
-attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
deleted file mode 100644
index 0f6fb5acd56ad..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
+++ /dev/null
@@ -1,279 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
-
-define amdgpu_ps float @ds_fmin_f32_ss(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmin_f32_ss:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fmin_f32_ss:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: ds_fmin_f32_ss:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX11-LABEL: ds_fmin_f32_ss:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define amdgpu_ps float @ds_fmin_f32_ss_offset(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmin_f32_ss_offset:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_rtn_f32 v0, v1, v0 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: ds_fmin_f32_ss_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v1, v0 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: ds_fmin_f32_ss_offset:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    ds_min_rtn_f32 v0, v1, v0 offset:512
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX11-LABEL: ds_fmin_f32_ss_offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    ds_min_rtn_f32 v0, v1, v0 offset:512
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define amdgpu_ps void @ds_fmin_f32_ss_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmin_f32_ss_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_f32 v0, v1
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: ds_fmin_f32_ss_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    ds_min_f32 v0, v1
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: ds_fmin_f32_ss_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    ds_min_f32 v0, v1
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: ds_fmin_f32_ss_nortn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    ds_min_f32 v0, v1
-; GFX11-NEXT:    s_endpgm
-  %unused = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define amdgpu_ps void @ds_fmin_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) {
-; GFX8-LABEL: ds_fmin_f32_ss_offset_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_f32 v1, v0 offset:512
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: ds_fmin_f32_ss_offset_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    ds_min_f32 v1, v0 offset:512
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: ds_fmin_f32_ss_offset_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    ds_min_f32 v1, v0 offset:512
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: ds_fmin_f32_ss_offset_nortn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    ds_min_f32 v1, v0 offset:512
-; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %unused = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define float @ds_fmin_f32_vv(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmin_f32_vv:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmin_f32_vv:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fmin_f32_vv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define float @ds_fmin_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmin_f32_vv_offset:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmin_f32_vv_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret float %ret
-}
-
-define void @ds_fmin_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmin_f32_vv_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_f32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmin_f32_vv_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_min_f32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fmin_f32_vv_nortn:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    ds_min_f32 v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define void @ds_fmin_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmin_f32_vv_offset_nortn:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmin_f32_vv_offset_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset_nortn:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
-  ret void
-}
-
-define float @ds_fmin_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) {
-; GFX8-LABEL: ds_fmin_f32_vv_volatile:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 m0, -1
-; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: ds_fmin_f32_vv_volatile:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: ds_fmin_f32_vv_volatile:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
-  ret float %ret
-}
-
-declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) #0
-
-attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
deleted file mode 100644
index 142a6ed19daf8..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ /dev/null
@@ -1,1418 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s  | FileCheck -enable-var-scope -check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
-
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s  | FileCheck -enable-var-scope -check-prefix=G_GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_VI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX11 %s
-
-declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
-declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
-declare double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
-
-
-define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
-; SI-LABEL: lds_ds_fmin:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; SI-NEXT:    s_add_u32 s4, s4, s3
-; SI-NEXT:    s_addc_u32 s5, s5, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s3, s2, 4
-; SI-NEXT:    s_lshl_b32 s2, s2, 3
-; SI-NEXT:    s_add_i32 s2, s2, 32
-; SI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_min_rtn_f32 v1, v1, v0
-; SI-NEXT:    s_add_i32 s2, s3, 64
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    ds_min_f32 v2, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: lds_ds_fmin:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s4, s4, s3
-; GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_min_rtn_f32 v1, v1, v0 offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    ds_min_f32 v2, v0 offset:64
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; VI-LABEL: lds_ds_fmin:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s3, s2, 3
-; VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    ds_min_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT:    s_lshl_b32 s2, s2, 4
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    ds_min_f32 v2, v0 offset:64
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: lds_ds_fmin:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s3
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    ds_min_rtn_f32 v1, v1, v0 offset:32
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_min_f32 v2, v0 offset:64
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: lds_ds_fmin:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT:    s_mov_b32 s10, -1
-; GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; GFX10-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    ds_min_rtn_f32 v1, v1, v0 offset:32
-; GFX10-NEXT:    ds_min_f32 v2, v0 offset:64
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    ds_min_rtn_f32 v0, v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: lds_ds_fmin:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    ds_min_rtn_f32 v1, v1, v0 offset:32
-; GFX11-NEXT:    ds_min_f32 v2, v0 offset:64
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX11-NEXT:    ds_min_rtn_f32 v0, v3, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v0, s0
-; GFX11-NEXT:    s_endpgm
-;
-; G_SI-LABEL: lds_ds_fmin:
-; G_SI:       ; %bb.0:
-; G_SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_SI-NEXT:    s_mov_b32 s6, -1
-; G_SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; G_SI-NEXT:    s_add_u32 s4, s4, s3
-; G_SI-NEXT:    s_addc_u32 s5, s5, 0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    s_add_i32 s2, s2, 4
-; G_SI-NEXT:    s_lshl_b32 s3, s2, 3
-; G_SI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    ds_min_rtn_f32 v1, v1, v0
-; G_SI-NEXT:    s_lshl_b32 s2, s2, 4
-; G_SI-NEXT:    v_mov_b32_e32 v2, s2
-; G_SI-NEXT:    ds_min_f32 v2, v0
-; G_SI-NEXT:    v_mov_b32_e32 v0, s1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_SI-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_SI-NEXT:    v_mov_b32_e32 v1, s0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: lds_ds_fmin:
-; G_GFX7:       ; %bb.0:
-; G_GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_GFX7-NEXT:    s_mov_b32 s6, -1
-; G_GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; G_GFX7-NEXT:    s_add_u32 s4, s4, s3
-; G_GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    ds_min_rtn_f32 v1, v1, v0
-; G_GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX7-NEXT:    ds_min_f32 v2, v0
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX7-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_VI-LABEL: lds_ds_fmin:
-; G_VI:       ; %bb.0:
-; G_VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; G_VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_VI-NEXT:    s_mov_b32 s90, -1
-; G_VI-NEXT:    s_mov_b32 s91, 0xe80000
-; G_VI-NEXT:    s_add_u32 s88, s88, s3
-; G_VI-NEXT:    s_addc_u32 s89, s89, 0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    s_add_i32 s2, s2, 4
-; G_VI-NEXT:    s_lshl_b32 s3, s2, 3
-; G_VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
-; G_VI-NEXT:    s_mov_b32 m0, -1
-; G_VI-NEXT:    ds_min_rtn_f32 v1, v1, v0
-; G_VI-NEXT:    s_lshl_b32 s2, s2, 4
-; G_VI-NEXT:    v_mov_b32_e32 v2, s2
-; G_VI-NEXT:    ds_min_f32 v2, v0
-; G_VI-NEXT:    v_mov_b32_e32 v0, s1
-; G_VI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_VI-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_VI-NEXT:    v_mov_b32_e32 v1, s0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
-; G_VI-NEXT:    s_endpgm
-;
-; G_GFX9-LABEL: lds_ds_fmin:
-; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX9-NEXT:    s_mov_b32 s10, -1
-; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX9-NEXT:    ds_min_f32 v2, v1
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX9-NEXT:    ds_min_rtn_f32 v0, v1, v0
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; G_GFX9-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: lds_ds_fmin:
-; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT:    s_mov_b32 s6, -1
-; G_GFX10-NEXT:    s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT:    s_add_u32 s4, s4, s3
-; G_GFX10-NEXT:    s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX10-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; G_GFX10-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_GFX10-NEXT:    ds_min_f32 v2, v1
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX10-NEXT:    ds_min_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX11-LABEL: lds_ds_fmin:
-; G_GFX11:       ; %bb.0:
-; G_GFX11-NEXT:    s_clause 0x1
-; G_GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; G_GFX11-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX11-NEXT:    v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT:    v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX11-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; G_GFX11-NEXT:    ds_min_f32 v2, v1
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX11-NEXT:    ds_min_rtn_f32 v0, v3, v0
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    scratch_store_b32 off, v0, s0
-; G_GFX11-NEXT:    s_endpgm
-  %idx.add = add nuw i32 %idx, 4
-  %shl0 = shl i32 %idx.add, 3
-  %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
-  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
-  %a1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
-  store float %a3, ptr addrspace(5) %out
-  ret void
-}
-
-define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
-; SI-LABEL: lds_ds_fmax:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; SI-NEXT:    s_add_u32 s4, s4, s3
-; SI-NEXT:    s_addc_u32 s5, s5, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s3, s2, 4
-; SI-NEXT:    s_lshl_b32 s2, s2, 3
-; SI-NEXT:    s_add_i32 s2, s2, 32
-; SI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_max_rtn_f32 v1, v1, v0
-; SI-NEXT:    s_add_i32 s2, s3, 64
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    ds_max_f32 v2, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: lds_ds_fmax:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s4, s4, s3
-; GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_max_rtn_f32 v1, v1, v0 offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    ds_max_f32 v2, v0 offset:64
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; VI-LABEL: lds_ds_fmax:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s3, s2, 3
-; VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    ds_max_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT:    s_lshl_b32 s2, s2, 4
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    ds_max_f32 v2, v0 offset:64
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: lds_ds_fmax:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s3
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    ds_max_rtn_f32 v1, v1, v0 offset:32
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_max_f32 v2, v0 offset:64
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: lds_ds_fmax:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT:    s_mov_b32 s10, -1
-; GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; GFX10-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    ds_max_rtn_f32 v1, v1, v0 offset:32
-; GFX10-NEXT:    ds_max_f32 v2, v0 offset:64
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    ds_max_rtn_f32 v0, v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: lds_ds_fmax:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    ds_max_rtn_f32 v1, v1, v0 offset:32
-; GFX11-NEXT:    ds_max_f32 v2, v0 offset:64
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX11-NEXT:    ds_max_rtn_f32 v0, v3, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v0, s0
-; GFX11-NEXT:    s_endpgm
-;
-; G_SI-LABEL: lds_ds_fmax:
-; G_SI:       ; %bb.0:
-; G_SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_SI-NEXT:    s_mov_b32 s6, -1
-; G_SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; G_SI-NEXT:    s_add_u32 s4, s4, s3
-; G_SI-NEXT:    s_addc_u32 s5, s5, 0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    s_add_i32 s2, s2, 4
-; G_SI-NEXT:    s_lshl_b32 s3, s2, 3
-; G_SI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    ds_max_rtn_f32 v1, v1, v0
-; G_SI-NEXT:    s_lshl_b32 s2, s2, 4
-; G_SI-NEXT:    v_mov_b32_e32 v2, s2
-; G_SI-NEXT:    ds_max_f32 v2, v0
-; G_SI-NEXT:    v_mov_b32_e32 v0, s1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_SI-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_SI-NEXT:    v_mov_b32_e32 v1, s0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: lds_ds_fmax:
-; G_GFX7:       ; %bb.0:
-; G_GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_GFX7-NEXT:    s_mov_b32 s6, -1
-; G_GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; G_GFX7-NEXT:    s_add_u32 s4, s4, s3
-; G_GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    ds_max_rtn_f32 v1, v1, v0
-; G_GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX7-NEXT:    ds_max_f32 v2, v0
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX7-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_VI-LABEL: lds_ds_fmax:
-; G_VI:       ; %bb.0:
-; G_VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; G_VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_VI-NEXT:    s_mov_b32 s90, -1
-; G_VI-NEXT:    s_mov_b32 s91, 0xe80000
-; G_VI-NEXT:    s_add_u32 s88, s88, s3
-; G_VI-NEXT:    s_addc_u32 s89, s89, 0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    s_add_i32 s2, s2, 4
-; G_VI-NEXT:    s_lshl_b32 s3, s2, 3
-; G_VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
-; G_VI-NEXT:    s_mov_b32 m0, -1
-; G_VI-NEXT:    ds_max_rtn_f32 v1, v1, v0
-; G_VI-NEXT:    s_lshl_b32 s2, s2, 4
-; G_VI-NEXT:    v_mov_b32_e32 v2, s2
-; G_VI-NEXT:    ds_max_f32 v2, v0
-; G_VI-NEXT:    v_mov_b32_e32 v0, s1
-; G_VI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_VI-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_VI-NEXT:    v_mov_b32_e32 v1, s0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
-; G_VI-NEXT:    s_endpgm
-;
-; G_GFX9-LABEL: lds_ds_fmax:
-; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX9-NEXT:    s_mov_b32 s10, -1
-; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX9-NEXT:    ds_max_f32 v2, v1
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX9-NEXT:    ds_max_rtn_f32 v0, v1, v0
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
-; G_GFX9-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: lds_ds_fmax:
-; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT:    s_mov_b32 s6, -1
-; G_GFX10-NEXT:    s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT:    s_add_u32 s4, s4, s3
-; G_GFX10-NEXT:    s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX10-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; G_GFX10-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_GFX10-NEXT:    ds_max_f32 v2, v1
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX10-NEXT:    ds_max_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX11-LABEL: lds_ds_fmax:
-; G_GFX11:       ; %bb.0:
-; G_GFX11-NEXT:    s_clause 0x1
-; G_GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; G_GFX11-NEXT:    v_mov_b32_e32 v1, 0x42280000
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    s_add_i32 s2, s2, 4
-; G_GFX11-NEXT:    v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT:    v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX11-NEXT:    ds_max_rtn_f32 v0, v0, v1
-; G_GFX11-NEXT:    ds_max_f32 v2, v1
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX11-NEXT:    ds_max_rtn_f32 v0, v3, v0
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    scratch_store_b32 off, v0, s0
-; G_GFX11-NEXT:    s_endpgm
-  %idx.add = add nuw i32 %idx, 4
-  %shl0 = shl i32 %idx.add, 3
-  %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
-  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
-  %a1 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
-  store float %a3, ptr addrspace(5) %out
-  ret void
-}
-
-define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
-; SI-LABEL: lds_ds_fmin_f64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; SI-NEXT:    s_add_u32 s4, s4, s3
-; SI-NEXT:    s_addc_u32 s5, s5, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s3, s2, 4
-; SI-NEXT:    s_lshl_b32 s2, s2, 3
-; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    s_add_i32 s2, s2, 32
-; SI-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_add_i32 s1, s3, 64
-; SI-NEXT:    v_mov_b32_e32 v5, s1
-; SI-NEXT:    ds_min_f64 v5, v[0:1]
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; SI-NEXT:    s_add_i32 s1, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v2, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
-; SI-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: lds_ds_fmin_f64:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s4, s4, s3
-; GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX7-NEXT:    v_mov_b32_e32 v2, s3
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX7-NEXT:    v_mov_b32_e32 v5, s2
-; GFX7-NEXT:    v_mov_b32_e32 v4, s1
-; GFX7-NEXT:    ds_min_f64 v5, v[0:1] offset:64
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; GFX7-NEXT:    s_add_i32 s1, s0, 4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
-; GFX7-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; VI-LABEL: lds_ds_fmin_f64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s3, s2, 3
-; VI-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT:    s_lshl_b32 s2, s2, 4
-; VI-NEXT:    v_mov_b32_e32 v5, s2
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    ds_min_f64 v5, v[0:1] offset:64
-; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT:    s_add_i32 s1, s0, 4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
-; VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: lds_ds_fmin_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s3
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NEXT:    ds_min_f64 v5, v[0:1] offset:64
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: lds_ds_fmin_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT:    s_mov_b32 s10, -1
-; GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; GFX10-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX10-NEXT:    ds_min_f64 v4, v[0:1] offset:64
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; GFX10-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: lds_ds_fmin_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX11-NEXT:    v_mov_b32_e32 v5, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v4, s2
-; GFX11-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX11-NEXT:    ds_min_f64 v4, v[0:1] offset:64
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX11-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[2:3]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s0
-; GFX11-NEXT:    s_endpgm
-;
-; G_SI-LABEL: lds_ds_fmin_f64:
-; G_SI:       ; %bb.0:
-; G_SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_SI-NEXT:    s_load_dword s4, s[0:1], 0xb
-; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_SI-NEXT:    s_mov_b32 s10, -1
-; G_SI-NEXT:    s_mov_b32 s11, 0xe8f000
-; G_SI-NEXT:    s_add_u32 s8, s8, s3
-; G_SI-NEXT:    s_mov_b32 s2, 0
-; G_SI-NEXT:    s_addc_u32 s9, s9, 0
-; G_SI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    s_add_i32 s4, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
-; G_SI-NEXT:    s_lshl_b32 s2, s4, 3
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    v_mov_b32_e32 v2, s2
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_SI-NEXT:    s_lshl_b32 s2, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v4, s2
-; G_SI-NEXT:    ds_min_f64 v4, v[0:1]
-; G_SI-NEXT:    v_mov_b32_e32 v0, s1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_SI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_SI-NEXT:    v_mov_b32_e32 v2, s0
-; G_SI-NEXT:    s_add_u32 s0, s0, 4
-; G_SI-NEXT:    v_mov_b32_e32 v3, s0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_SI-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: lds_ds_fmin_f64:
-; G_GFX7:       ; %bb.0:
-; G_GFX7-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX7-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX7-NEXT:    s_load_dword s4, s[0:1], 0xb
-; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_GFX7-NEXT:    s_mov_b32 s10, -1
-; G_GFX7-NEXT:    s_mov_b32 s11, 0xe8f000
-; G_GFX7-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX7-NEXT:    s_mov_b32 s2, 0
-; G_GFX7-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 3
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; G_GFX7-NEXT:    ds_min_f64 v4, v[0:1]
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX7-NEXT:    s_add_u32 s0, s0, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v3, s0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX7-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_VI-LABEL: lds_ds_fmin_f64:
-; G_VI:       ; %bb.0:
-; G_VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; G_VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_VI-NEXT:    s_mov_b32 s90, -1
-; G_VI-NEXT:    s_mov_b32 s91, 0xe80000
-; G_VI-NEXT:    s_add_u32 s88, s88, s3
-; G_VI-NEXT:    s_mov_b32 s2, 0
-; G_VI-NEXT:    s_addc_u32 s89, s89, 0
-; G_VI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    s_add_i32 s4, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v0, s2
-; G_VI-NEXT:    s_lshl_b32 s2, s4, 3
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
-; G_VI-NEXT:    v_mov_b32_e32 v2, s2
-; G_VI-NEXT:    s_mov_b32 m0, -1
-; G_VI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT:    s_lshl_b32 s2, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v4, s2
-; G_VI-NEXT:    ds_min_f64 v4, v[0:1]
-; G_VI-NEXT:    v_mov_b32_e32 v0, s1
-; G_VI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_VI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT:    v_mov_b32_e32 v2, s0
-; G_VI-NEXT:    s_add_u32 s0, s0, 4
-; G_VI-NEXT:    v_mov_b32_e32 v3, s0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
-; G_VI-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
-; G_VI-NEXT:    s_endpgm
-;
-; G_GFX9-LABEL: lds_ds_fmin_f64:
-; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX9-NEXT:    s_mov_b32 s10, -1
-; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX9-NEXT:    s_mov_b32 s0, 0
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX9-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; G_GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; G_GFX9-NEXT:    ds_min_f64 v5, v[0:1]
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX9-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX9-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; G_GFX9-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: lds_ds_fmin_f64:
-; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT:    s_mov_b32 s10, -1
-; G_GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; G_GFX10-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX10-NEXT:    s_mov_b32 s0, 0
-; G_GFX10-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; G_GFX10-NEXT:    s_lshl_b32 s5, s4, 3
-; G_GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; G_GFX10-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX10-NEXT:    ds_min_f64 v4, v[0:1]
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX10-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[2:3]
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX10-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX11-LABEL: lds_ds_fmin_f64:
-; G_GFX11:       ; %bb.0:
-; G_GFX11-NEXT:    s_clause 0x1
-; G_GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    s_add_i32 s4, s2, 4
-; G_GFX11-NEXT:    s_mov_b32 s2, 0
-; G_GFX11-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT:    s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT:    v_mov_b32_e32 v4, s2
-; G_GFX11-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX11-NEXT:    ds_min_f64 v4, v[0:1]
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX11-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[2:3]
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    scratch_store_b64 off, v[0:1], s0
-; G_GFX11-NEXT:    s_endpgm
-  %idx.add = add nuw i32 %idx, 4
-  %shl0 = shl i32 %idx.add, 3
-  %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
-  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
-  %a1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false)
-  store double %a3, ptr addrspace(5) %out
-  ret void
-}
-
-define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
-; SI-LABEL: lds_ds_fmax_f64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s7, 0xe8f000
-; SI-NEXT:    s_add_u32 s4, s4, s3
-; SI-NEXT:    s_addc_u32 s5, s5, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s3, s2, 4
-; SI-NEXT:    s_lshl_b32 s2, s2, 3
-; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    s_add_i32 s2, s2, 32
-; SI-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_add_i32 s1, s3, 64
-; SI-NEXT:    v_mov_b32_e32 v5, s1
-; SI-NEXT:    ds_max_f64 v5, v[0:1]
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; SI-NEXT:    s_add_i32 s1, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v2, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
-; SI-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: lds_ds_fmax_f64:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s4, s4, s3
-; GFX7-NEXT:    s_addc_u32 s5, s5, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX7-NEXT:    v_mov_b32_e32 v2, s3
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX7-NEXT:    v_mov_b32_e32 v5, s2
-; GFX7-NEXT:    v_mov_b32_e32 v4, s1
-; GFX7-NEXT:    ds_max_f64 v5, v[0:1] offset:64
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; GFX7-NEXT:    s_add_i32 s1, s0, 4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
-; GFX7-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; VI-LABEL: lds_ds_fmax_f64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s3, s2, 3
-; VI-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT:    s_lshl_b32 s2, s2, 4
-; VI-NEXT:    v_mov_b32_e32 v5, s2
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    ds_max_f64 v5, v[0:1] offset:64
-; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT:    s_add_i32 s1, s0, 4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
-; VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: lds_ds_fmax_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s3
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NEXT:    ds_max_f64 v5, v[0:1] offset:64
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: lds_ds_fmax_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT:    s_mov_b32 s10, -1
-; GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; GFX10-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX10-NEXT:    ds_max_f64 v4, v[0:1] offset:64
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; GFX10-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: lds_ds_fmax_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
-; GFX11-NEXT:    v_mov_b32_e32 v5, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v4, s2
-; GFX11-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX11-NEXT:    ds_max_f64 v4, v[0:1] offset:64
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX11-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[2:3]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s0
-; GFX11-NEXT:    s_endpgm
-;
-; G_SI-LABEL: lds_ds_fmax_f64:
-; G_SI:       ; %bb.0:
-; G_SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_SI-NEXT:    s_load_dword s4, s[0:1], 0xb
-; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_SI-NEXT:    s_mov_b32 s10, -1
-; G_SI-NEXT:    s_mov_b32 s11, 0xe8f000
-; G_SI-NEXT:    s_add_u32 s8, s8, s3
-; G_SI-NEXT:    s_mov_b32 s2, 0
-; G_SI-NEXT:    s_addc_u32 s9, s9, 0
-; G_SI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    s_add_i32 s4, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
-; G_SI-NEXT:    s_lshl_b32 s2, s4, 3
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    v_mov_b32_e32 v2, s2
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_SI-NEXT:    s_lshl_b32 s2, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v4, s2
-; G_SI-NEXT:    ds_max_f64 v4, v[0:1]
-; G_SI-NEXT:    v_mov_b32_e32 v0, s1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_SI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_SI-NEXT:    v_mov_b32_e32 v2, s0
-; G_SI-NEXT:    s_add_u32 s0, s0, 4
-; G_SI-NEXT:    v_mov_b32_e32 v3, s0
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_SI-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: lds_ds_fmax_f64:
-; G_GFX7:       ; %bb.0:
-; G_GFX7-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX7-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX7-NEXT:    s_load_dword s4, s[0:1], 0xb
-; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; G_GFX7-NEXT:    s_mov_b32 s10, -1
-; G_GFX7-NEXT:    s_mov_b32 s11, 0xe8f000
-; G_GFX7-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX7-NEXT:    s_mov_b32 s2, 0
-; G_GFX7-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 3
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; G_GFX7-NEXT:    ds_max_f64 v4, v[0:1]
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX7-NEXT:    s_add_u32 s0, s0, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v3, s0
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX7-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_VI-LABEL: lds_ds_fmax_f64:
-; G_VI:       ; %bb.0:
-; G_VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; G_VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; G_VI-NEXT:    s_mov_b32 s90, -1
-; G_VI-NEXT:    s_mov_b32 s91, 0xe80000
-; G_VI-NEXT:    s_add_u32 s88, s88, s3
-; G_VI-NEXT:    s_mov_b32 s2, 0
-; G_VI-NEXT:    s_addc_u32 s89, s89, 0
-; G_VI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    s_add_i32 s4, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v0, s2
-; G_VI-NEXT:    s_lshl_b32 s2, s4, 3
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
-; G_VI-NEXT:    v_mov_b32_e32 v2, s2
-; G_VI-NEXT:    s_mov_b32 m0, -1
-; G_VI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT:    s_lshl_b32 s2, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v4, s2
-; G_VI-NEXT:    ds_max_f64 v4, v[0:1]
-; G_VI-NEXT:    v_mov_b32_e32 v0, s1
-; G_VI-NEXT:    s_waitcnt lgkmcnt(1)
-; G_VI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT:    v_mov_b32_e32 v2, s0
-; G_VI-NEXT:    s_add_u32 s0, s0, 4
-; G_VI-NEXT:    v_mov_b32_e32 v3, s0
-; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
-; G_VI-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
-; G_VI-NEXT:    s_endpgm
-;
-; G_GFX9-LABEL: lds_ds_fmax_f64:
-; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX9-NEXT:    s_mov_b32 s10, -1
-; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX9-NEXT:    s_mov_b32 s0, 0
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; G_GFX9-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; G_GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; G_GFX9-NEXT:    ds_max_f64 v5, v[0:1]
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX9-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX9-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; G_GFX9-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: lds_ds_fmax_f64:
-; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT:    s_mov_b32 s10, -1
-; G_GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
-; G_GFX10-NEXT:    s_add_u32 s8, s8, s3
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; G_GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; G_GFX10-NEXT:    s_mov_b32 s0, 0
-; G_GFX10-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; G_GFX10-NEXT:    s_lshl_b32 s5, s4, 3
-; G_GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; G_GFX10-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX10-NEXT:    ds_max_f64 v4, v[0:1]
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX10-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[2:3]
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
-; G_GFX10-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX11-LABEL: lds_ds_fmax_f64:
-; G_GFX11:       ; %bb.0:
-; G_GFX11-NEXT:    s_clause 0x1
-; G_GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    s_add_i32 s4, s2, 4
-; G_GFX11-NEXT:    s_mov_b32 s2, 0
-; G_GFX11-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT:    s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT:    v_mov_b32_e32 v4, s2
-; G_GFX11-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_GFX11-NEXT:    ds_max_f64 v4, v[0:1]
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(1)
-; G_GFX11-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[2:3]
-; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT:    scratch_store_b64 off, v[0:1], s0
-; G_GFX11-NEXT:    s_endpgm
-  %idx.add = add nuw i32 %idx, 4
-  %shl0 = shl i32 %idx.add, 3
-  %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
-  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
-  %a1 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false)
-  store double %a3, ptr addrspace(5) %out
-  ret void
-}



More information about the llvm-commits mailing list